Source code for h2o.estimators.xgboost

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai;  Apache License Version 2.0 (see LICENSE for details)
#

import h2o
from h2o.estimators.estimator_base import H2OEstimator
from h2o.exceptions import H2OValueError
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type, Enum, numeric


[docs]class H2OXGBoostEstimator(H2OEstimator): """ XGBoost Builds an eXtreme Gradient Boosting model using the native XGBoost backend. """ algo = "xgboost" supervised_learning = True _options_ = {'model_extensions': ['h2o.model.extensions.ScoringHistoryTrees', 'h2o.model.extensions.VariableImportance', 'h2o.model.extensions.FeatureInteraction', 'h2o.model.extensions.Trees', 'h2o.model.extensions.SupervisedTrees', 'h2o.model.extensions.HStatistic', 'h2o.model.extensions.Contributions', 'h2o.model.extensions.Fairness'], 'verbose': True} def __init__(self, model_id=None, # type: Optional[Union[None, str, H2OEstimator]] training_frame=None, # type: Optional[Union[None, str, H2OFrame]] validation_frame=None, # type: Optional[Union[None, str, H2OFrame]] nfolds=0, # type: int keep_cross_validation_models=True, # type: bool keep_cross_validation_predictions=False, # type: bool keep_cross_validation_fold_assignment=False, # type: bool score_each_iteration=False, # type: bool fold_assignment="auto", # type: Literal["auto", "random", "modulo", "stratified"] fold_column=None, # type: Optional[str] response_column=None, # type: Optional[str] ignored_columns=None, # type: Optional[List[str]] ignore_const_cols=True, # type: bool offset_column=None, # type: Optional[str] weights_column=None, # type: Optional[str] stopping_rounds=0, # type: int stopping_metric="auto", # type: Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"] stopping_tolerance=0.001, # type: float max_runtime_secs=0.0, # type: float seed=-1, # type: int distribution="auto", # type: Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber"] tweedie_power=1.5, # type: float categorical_encoding="auto", # type: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"] quiet_mode=True, # type: bool checkpoint=None, # type: Optional[Union[None, str, H2OEstimator]] export_checkpoints_dir=None, # type: Optional[str] custom_metric_func=None, # type: Optional[str] ntrees=50, # type: int max_depth=6, # type: int min_rows=1.0, # type: float min_child_weight=1.0, # type: float learn_rate=0.3, # type: float eta=0.3, # type: float sample_rate=1.0, # type: float subsample=1.0, # type: float col_sample_rate=1.0, # type: float colsample_bylevel=1.0, # type: float col_sample_rate_per_tree=1.0, # type: float colsample_bytree=1.0, # type: float colsample_bynode=1.0, # type: float max_abs_leafnode_pred=0.0, # type: float max_delta_step=0.0, # type: float monotone_constraints=None, # type: Optional[dict] interaction_constraints=None, # type: Optional[List[List[str]]] score_tree_interval=0, # type: int min_split_improvement=0.0, # type: float gamma=0.0, # type: float nthread=-1, # type: int save_matrix_directory=None, # type: Optional[str] build_tree_one_node=False, # type: bool parallelize_cross_validation=True, # type: bool calibrate_model=False, # type: bool calibration_frame=None, # type: Optional[Union[None, str, H2OFrame]] calibration_method="auto", # type: Literal["auto", "platt_scaling", "isotonic_regression"] max_bins=256, # type: int max_leaves=0, # type: int sample_type="uniform", # type: Literal["uniform", "weighted"] normalize_type="tree", # type: Literal["tree", "forest"] rate_drop=0.0, # type: float one_drop=False, # type: bool skip_drop=0.0, # type: float tree_method="auto", # type: Literal["auto", "exact", "approx", "hist"] grow_policy="depthwise", # type: Literal["depthwise", "lossguide"] booster="gbtree", # type: Literal["gbtree", "gblinear", "dart"] reg_lambda=1.0, # type: float reg_alpha=0.0, # type: float dmatrix_type="auto", # type: Literal["auto", "dense", "sparse"] backend="auto", # type: Literal["auto", "gpu", "cpu"] gpu_id=None, # type: Optional[List[int]] gainslift_bins=-1, # type: int auc_type="auto", # type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"] scale_pos_weight=1.0, # type: float eval_metric=None, # type: Optional[str] score_eval_metric_only=False, # type: bool ): """ :param model_id: Destination id for this model; auto-generated if not specified. Defaults to ``None``. :type model_id: Union[None, str, H2OEstimator], optional :param training_frame: Id of the training data frame. Defaults to ``None``. :type training_frame: Union[None, str, H2OFrame], optional :param validation_frame: Id of the validation data frame. Defaults to ``None``. :type validation_frame: Union[None, str, H2OFrame], optional :param nfolds: Number of folds for K-fold cross-validation (0 to disable or >= 2). Defaults to ``0``. :type nfolds: int :param keep_cross_validation_models: Whether to keep the cross-validation models. Defaults to ``True``. :type keep_cross_validation_models: bool :param keep_cross_validation_predictions: Whether to keep the predictions of the cross-validation models. Defaults to ``False``. :type keep_cross_validation_predictions: bool :param keep_cross_validation_fold_assignment: Whether to keep the cross-validation fold assignment. Defaults to ``False``. :type keep_cross_validation_fold_assignment: bool :param score_each_iteration: Whether to score during each iteration of model training. Defaults to ``False``. :type score_each_iteration: bool :param fold_assignment: Cross-validation fold assignment scheme, if fold_column is not specified. The 'Stratified' option will stratify the folds based on the response variable, for classification problems. Defaults to ``"auto"``. :type fold_assignment: Literal["auto", "random", "modulo", "stratified"] :param fold_column: Column with cross-validation fold index assignment per observation. Defaults to ``None``. :type fold_column: str, optional :param response_column: Response variable column. Defaults to ``None``. :type response_column: str, optional :param ignored_columns: Names of columns to ignore for training. Defaults to ``None``. :type ignored_columns: List[str], optional :param ignore_const_cols: Ignore constant columns. Defaults to ``True``. :type ignore_const_cols: bool :param offset_column: Offset column. This will be added to the combination of columns before applying the link function. Defaults to ``None``. :type offset_column: str, optional :param weights_column: Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data frame. This is typically the number of times a row is repeated, but non-integer values are supported as well. During training, rows with higher weights matter more, due to the larger loss function pre-factor. If you set weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get an accurate prediction, remove all rows with weight == 0. Defaults to ``None``. :type weights_column: str, optional :param stopping_rounds: Early stopping based on convergence of stopping_metric. Stop if simple moving average of length k of the stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable) Defaults to ``0``. :type stopping_rounds: int :param stopping_metric: Metric to use for early stopping (AUTO: logloss for classification, deviance for regression and anomaly_score for Isolation Forest). Note that custom and custom_increasing can only be used in GBM and DRF with the Python client. Defaults to ``"auto"``. :type stopping_metric: Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"] :param stopping_tolerance: Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this much) Defaults to ``0.001``. :type stopping_tolerance: float :param max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable. Defaults to ``0.0``. :type max_runtime_secs: float :param seed: Seed for pseudo random number generator (if applicable) Defaults to ``-1``. :type seed: int :param distribution: Distribution function Defaults to ``"auto"``. :type distribution: Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber"] :param tweedie_power: Tweedie power for Tweedie regression, must be between 1 and 2. Defaults to ``1.5``. :type tweedie_power: float :param categorical_encoding: Encoding scheme for categorical features Defaults to ``"auto"``. :type categorical_encoding: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"] :param quiet_mode: Enable quiet mode Defaults to ``True``. :type quiet_mode: bool :param checkpoint: Model checkpoint to resume training with. Defaults to ``None``. :type checkpoint: Union[None, str, H2OEstimator], optional :param export_checkpoints_dir: Automatically export generated models to this directory. Defaults to ``None``. :type export_checkpoints_dir: str, optional :param custom_metric_func: Reference to custom evaluation function, format: `language:keyName=funcName` Defaults to ``None``. :type custom_metric_func: str, optional :param ntrees: (same as n_estimators) Number of trees. Defaults to ``50``. :type ntrees: int :param max_depth: Maximum tree depth (0 for unlimited). Defaults to ``6``. :type max_depth: int :param min_rows: (same as min_child_weight) Fewest allowed (weighted) observations in a leaf. Defaults to ``1.0``. :type min_rows: float :param min_child_weight: (same as min_rows) Fewest allowed (weighted) observations in a leaf. Defaults to ``1.0``. :type min_child_weight: float :param learn_rate: (same as eta) Learning rate (from 0.0 to 1.0) Defaults to ``0.3``. :type learn_rate: float :param eta: (same as learn_rate) Learning rate (from 0.0 to 1.0) Defaults to ``0.3``. :type eta: float :param sample_rate: (same as subsample) Row sample rate per tree (from 0.0 to 1.0) Defaults to ``1.0``. :type sample_rate: float :param subsample: (same as sample_rate) Row sample rate per tree (from 0.0 to 1.0) Defaults to ``1.0``. :type subsample: float :param col_sample_rate: (same as colsample_bylevel) Column sample rate (from 0.0 to 1.0) Defaults to ``1.0``. :type col_sample_rate: float :param colsample_bylevel: (same as col_sample_rate) Column sample rate (from 0.0 to 1.0) Defaults to ``1.0``. :type colsample_bylevel: float :param col_sample_rate_per_tree: (same as colsample_bytree) Column sample rate per tree (from 0.0 to 1.0) Defaults to ``1.0``. :type col_sample_rate_per_tree: float :param colsample_bytree: (same as col_sample_rate_per_tree) Column sample rate per tree (from 0.0 to 1.0) Defaults to ``1.0``. :type colsample_bytree: float :param colsample_bynode: Column sample rate per tree node (from 0.0 to 1.0) Defaults to ``1.0``. :type colsample_bynode: float :param max_abs_leafnode_pred: (same as max_delta_step) Maximum absolute value of a leaf node prediction Defaults to ``0.0``. :type max_abs_leafnode_pred: float :param max_delta_step: (same as max_abs_leafnode_pred) Maximum absolute value of a leaf node prediction Defaults to ``0.0``. :type max_delta_step: float :param monotone_constraints: A mapping representing monotonic constraints. Use +1 to enforce an increasing constraint and -1 to specify a decreasing constraint. Defaults to ``None``. :type monotone_constraints: dict, optional :param interaction_constraints: A set of allowed column interactions. Defaults to ``None``. :type interaction_constraints: List[List[str]], optional :param score_tree_interval: Score the model after every so many trees. Disabled if set to 0. Defaults to ``0``. :type score_tree_interval: int :param min_split_improvement: (same as gamma) Minimum relative improvement in squared error reduction for a split to happen Defaults to ``0.0``. :type min_split_improvement: float :param gamma: (same as min_split_improvement) Minimum relative improvement in squared error reduction for a split to happen Defaults to ``0.0``. :type gamma: float :param nthread: Number of parallel threads that can be used to run XGBoost. Cannot exceed H2O cluster limits (-nthreads parameter). Defaults to maximum available Defaults to ``-1``. :type nthread: int :param save_matrix_directory: Directory where to save matrices passed to XGBoost library. Useful for debugging. Defaults to ``None``. :type save_matrix_directory: str, optional :param build_tree_one_node: Run on one node only; no network overhead but fewer cpus used. Suitable for small datasets. Defaults to ``False``. :type build_tree_one_node: bool :param parallelize_cross_validation: Allow parallel training of cross-validation models Defaults to ``True``. :type parallelize_cross_validation: bool :param calibrate_model: Use Platt Scaling (default) or Isotonic Regression to calculate calibrated class probabilities. Calibration can provide more accurate estimates of class probabilities. Defaults to ``False``. :type calibrate_model: bool :param calibration_frame: Data for model calibration Defaults to ``None``. :type calibration_frame: Union[None, str, H2OFrame], optional :param calibration_method: Calibration method to use Defaults to ``"auto"``. :type calibration_method: Literal["auto", "platt_scaling", "isotonic_regression"] :param max_bins: For tree_method=hist only: maximum number of bins Defaults to ``256``. :type max_bins: int :param max_leaves: For tree_method=hist only: maximum number of leaves Defaults to ``0``. :type max_leaves: int :param sample_type: For booster=dart only: sample_type Defaults to ``"uniform"``. :type sample_type: Literal["uniform", "weighted"] :param normalize_type: For booster=dart only: normalize_type Defaults to ``"tree"``. :type normalize_type: Literal["tree", "forest"] :param rate_drop: For booster=dart only: rate_drop (0..1) Defaults to ``0.0``. :type rate_drop: float :param one_drop: For booster=dart only: one_drop Defaults to ``False``. :type one_drop: bool :param skip_drop: For booster=dart only: skip_drop (0..1) Defaults to ``0.0``. :type skip_drop: float :param tree_method: Tree method Defaults to ``"auto"``. :type tree_method: Literal["auto", "exact", "approx", "hist"] :param grow_policy: Grow policy - depthwise is standard GBM, lossguide is LightGBM Defaults to ``"depthwise"``. :type grow_policy: Literal["depthwise", "lossguide"] :param booster: Booster type Defaults to ``"gbtree"``. :type booster: Literal["gbtree", "gblinear", "dart"] :param reg_lambda: L2 regularization Defaults to ``1.0``. :type reg_lambda: float :param reg_alpha: L1 regularization Defaults to ``0.0``. :type reg_alpha: float :param dmatrix_type: Type of DMatrix. For sparse, NAs and 0 are treated equally. Defaults to ``"auto"``. :type dmatrix_type: Literal["auto", "dense", "sparse"] :param backend: Backend. By default (auto), a GPU is used if available. Defaults to ``"auto"``. :type backend: Literal["auto", "gpu", "cpu"] :param gpu_id: Which GPU(s) to use. Defaults to ``None``. :type gpu_id: List[int], optional :param gainslift_bins: Gains/Lift table number of bins. 0 means disabled.. Default value -1 means automatic binning. Defaults to ``-1``. :type gainslift_bins: int :param auc_type: Set default multinomial AUC type. Defaults to ``"auto"``. :type auc_type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"] :param scale_pos_weight: Controls the effect of observations with positive labels in relation to the observations with negative labels on gradient calculation. Useful for imbalanced problems. Defaults to ``1.0``. :type scale_pos_weight: float :param eval_metric: Specification of evaluation metric that will be passed to the native XGBoost backend. Defaults to ``None``. :type eval_metric: str, optional :param score_eval_metric_only: If enabled, score only the evaluation metric. This can make model training faster if scoring is frequent (eg. each iteration). Defaults to ``False``. :type score_eval_metric_only: bool """ super(H2OXGBoostEstimator, self).__init__() self._parms = {} self._id = self._parms['model_id'] = model_id self.training_frame = training_frame self.validation_frame = validation_frame self.nfolds = nfolds self.keep_cross_validation_models = keep_cross_validation_models self.keep_cross_validation_predictions = keep_cross_validation_predictions self.keep_cross_validation_fold_assignment = keep_cross_validation_fold_assignment self.score_each_iteration = score_each_iteration self.fold_assignment = fold_assignment self.fold_column = fold_column self.response_column = response_column self.ignored_columns = ignored_columns self.ignore_const_cols = ignore_const_cols self.offset_column = offset_column self.weights_column = weights_column self.stopping_rounds = stopping_rounds self.stopping_metric = stopping_metric self.stopping_tolerance = stopping_tolerance self.max_runtime_secs = max_runtime_secs self.seed = seed self.distribution = distribution self.tweedie_power = tweedie_power self.categorical_encoding = categorical_encoding self.quiet_mode = quiet_mode self.checkpoint = checkpoint self.export_checkpoints_dir = export_checkpoints_dir self.custom_metric_func = custom_metric_func self.ntrees = ntrees self.max_depth = max_depth self.min_rows = min_rows self.min_child_weight = min_child_weight self.learn_rate = learn_rate self.eta = eta self.sample_rate = sample_rate self.subsample = subsample self.col_sample_rate = col_sample_rate self.colsample_bylevel = colsample_bylevel self.col_sample_rate_per_tree = col_sample_rate_per_tree self.colsample_bytree = colsample_bytree self.colsample_bynode = colsample_bynode self.max_abs_leafnode_pred = max_abs_leafnode_pred self.max_delta_step = max_delta_step self.monotone_constraints = monotone_constraints self.interaction_constraints = interaction_constraints self.score_tree_interval = score_tree_interval self.min_split_improvement = min_split_improvement self.gamma = gamma self.nthread = nthread self.save_matrix_directory = save_matrix_directory self.build_tree_one_node = build_tree_one_node self.parallelize_cross_validation = parallelize_cross_validation self.calibrate_model = calibrate_model self.calibration_frame = calibration_frame self.calibration_method = calibration_method self.max_bins = max_bins self.max_leaves = max_leaves self.sample_type = sample_type self.normalize_type = normalize_type self.rate_drop = rate_drop self.one_drop = one_drop self.skip_drop = skip_drop self.tree_method = tree_method self.grow_policy = grow_policy self.booster = booster self.reg_lambda = reg_lambda self.reg_alpha = reg_alpha self.dmatrix_type = dmatrix_type self.backend = backend self.gpu_id = gpu_id self.gainslift_bins = gainslift_bins self.auc_type = auc_type self.scale_pos_weight = scale_pos_weight self.eval_metric = eval_metric self.score_eval_metric_only = score_eval_metric_only @property def training_frame(self): """ Id of the training data frame. Type: ``Union[None, str, H2OFrame]``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> titanic_xgb.auc(valid=True) """ return self._parms.get("training_frame") @training_frame.setter def training_frame(self, training_frame): self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame') @property def validation_frame(self): """ Id of the validation data frame. Type: ``Union[None, str, H2OFrame]``. :examples: >>> insurance = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv") >>> insurance['Group'] = insurance['Group'].asfactor() >>> insurance['Age'] = insurance['Age'].asfactor() >>> predictors = insurance.columns[0:4] >>> response = 'Claims' >>> train, valid = insurance.split_frame(ratios=[.8], ... seed=1234) >>> insurance_xgb = H2OXGBoostEstimator(seed=1234) >>> insurance_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(insurance_xgb.mse(valid=True)) """ return self._parms.get("validation_frame") @validation_frame.setter def validation_frame(self, validation_frame): self._parms["validation_frame"] = H2OFrame._validate(validation_frame, 'validation_frame') @property def nfolds(self): """ Number of folds for K-fold cross-validation (0 to disable or >= 2). Type: ``int``, defaults to ``0``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> folds = 5 >>> titanic_xgb = H2OXGBoostEstimator(nfolds=folds, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=titanic) >>> titanic_xgb.auc(xval=True) """ return self._parms.get("nfolds") @nfolds.setter def nfolds(self, nfolds): assert_is_type(nfolds, None, int) self._parms["nfolds"] = nfolds @property def keep_cross_validation_models(self): """ Whether to keep the cross-validation models. Type: ``bool``, defaults to ``True``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(keep_cross_validation_models=True, ... nfolds=5 , ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train) >>> titanic_xgb.cross_validation_models() """ return self._parms.get("keep_cross_validation_models") @keep_cross_validation_models.setter def keep_cross_validation_models(self, keep_cross_validation_models): assert_is_type(keep_cross_validation_models, None, bool) self._parms["keep_cross_validation_models"] = keep_cross_validation_models @property def keep_cross_validation_predictions(self): """ Whether to keep the predictions of the cross-validation models. Type: ``bool``, defaults to ``False``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(keep_cross_validation_predictions=True, ... nfolds=5, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train) >>> titanic_xgb.cross_validation_predictions() """ return self._parms.get("keep_cross_validation_predictions") @keep_cross_validation_predictions.setter def keep_cross_validation_predictions(self, keep_cross_validation_predictions): assert_is_type(keep_cross_validation_predictions, None, bool) self._parms["keep_cross_validation_predictions"] = keep_cross_validation_predictions @property def keep_cross_validation_fold_assignment(self): """ Whether to keep the cross-validation fold assignment. Type: ``bool``, defaults to ``False``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(keep_cross_validation_fold_assignment=True, ... nfolds=5, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train) >>> titanic_xgb.cross_validation_fold_assignment() """ return self._parms.get("keep_cross_validation_fold_assignment") @keep_cross_validation_fold_assignment.setter def keep_cross_validation_fold_assignment(self, keep_cross_validation_fold_assignment): assert_is_type(keep_cross_validation_fold_assignment, None, bool) self._parms["keep_cross_validation_fold_assignment"] = keep_cross_validation_fold_assignment @property def score_each_iteration(self): """ Whether to score during each iteration of model training. Type: ``bool``, defaults to ``False``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(score_each_iteration=True, ... ntrees=55, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> airlines_xgb.scoring_history() """ return self._parms.get("score_each_iteration") @score_each_iteration.setter def score_each_iteration(self, score_each_iteration): assert_is_type(score_each_iteration, None, bool) self._parms["score_each_iteration"] = score_each_iteration @property def fold_assignment(self): """ Cross-validation fold assignment scheme, if fold_column is not specified. The 'Stratified' option will stratify the folds based on the response variable, for classification problems. Type: ``Literal["auto", "random", "modulo", "stratified"]``, defaults to ``"auto"``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> response = 'survived' >>> assignment_type = "Random" >>> titanic_xgb = H2OXGBoostEstimator(fold_assignment=assignment_type, ... nfolds=5, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=titanic) >>> titanic_xgb.auc(xval=True) """ return self._parms.get("fold_assignment") @fold_assignment.setter def fold_assignment(self, fold_assignment): assert_is_type(fold_assignment, None, Enum("auto", "random", "modulo", "stratified")) self._parms["fold_assignment"] = fold_assignment @property def fold_column(self): """ Column with cross-validation fold index assignment per observation. Type: ``str``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> response = 'survived' >>> fold_numbers = titanic.kfold_column(n_folds=5, ... seed=1234) >>> fold_numbers.set_names(["fold_numbers"]) >>> titanic = titanic.cbind(fold_numbers) >>> print(titanic['fold_numbers']) >>> titanic_xgb = H2OXGBoostEstimator(seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=titanic, ... fold_column="fold_numbers") >>> titanic_xgb.auc(xval=True) """ return self._parms.get("fold_column") @fold_column.setter def fold_column(self, fold_column): assert_is_type(fold_column, None, str) self._parms["fold_column"] = fold_column @property def response_column(self): """ Response variable column. Type: ``str``. """ return self._parms.get("response_column") @response_column.setter def response_column(self, response_column): assert_is_type(response_column, None, str) self._parms["response_column"] = response_column @property def ignored_columns(self): """ Names of columns to ignore for training. Type: ``List[str]``. """ return self._parms.get("ignored_columns") @ignored_columns.setter def ignored_columns(self, ignored_columns): assert_is_type(ignored_columns, None, [str]) self._parms["ignored_columns"] = ignored_columns @property def ignore_const_cols(self): """ Ignore constant columns. Type: ``bool``, defaults to ``True``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> response = 'survived' >>> titanic["const_1"] = 6 >>> titanic["const_2"] = 7 >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(seed=1234, ... ignore_const_cols=True) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> titanic_xgb.auc(valid=True) """ return self._parms.get("ignore_const_cols") @ignore_const_cols.setter def ignore_const_cols(self, ignore_const_cols): assert_is_type(ignore_const_cols, None, bool) self._parms["ignore_const_cols"] = ignore_const_cols @property def offset_column(self): """ Offset column. This will be added to the combination of columns before applying the link function. Type: ``str``. """ return self._parms.get("offset_column") @offset_column.setter def offset_column(self, offset_column): assert_is_type(offset_column, None, str) self._parms["offset_column"] = offset_column @property def weights_column(self): """ Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data frame. This is typically the number of times a row is repeated, but non-integer values are supported as well. During training, rows with higher weights matter more, due to the larger loss function pre-factor. If you set weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get an accurate prediction, remove all rows with weight == 0. Type: ``str``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> titanic_xgb.auc(valid=True) """ return self._parms.get("weights_column") @weights_column.setter def weights_column(self, weights_column): assert_is_type(weights_column, None, str) self._parms["weights_column"] = weights_column @property def stopping_rounds(self): """ Early stopping based on convergence of stopping_metric. Stop if simple moving average of length k of the stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable) Type: ``int``, defaults to ``0``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(stopping_metric="auc", ... stopping_rounds=3, ... stopping_tolerance=1e-2, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> airlines_xgb.auc(valid=True) """ return self._parms.get("stopping_rounds") @stopping_rounds.setter def stopping_rounds(self, stopping_rounds): assert_is_type(stopping_rounds, None, int) self._parms["stopping_rounds"] = stopping_rounds @property def stopping_metric(self): """ Metric to use for early stopping (AUTO: logloss for classification, deviance for regression and anomaly_score for Isolation Forest). Note that custom and custom_increasing can only be used in GBM and DRF with the Python client. Type: ``Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"]``, defaults to ``"auto"``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(stopping_metric="auc", ... stopping_rounds=3, ... stopping_tolerance=1e-2, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> airlines_xgb.auc(valid=True) """ return self._parms.get("stopping_metric") @stopping_metric.setter def stopping_metric(self, stopping_metric): assert_is_type(stopping_metric, None, Enum("auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing")) self._parms["stopping_metric"] = stopping_metric @property def stopping_tolerance(self): """ Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this much) Type: ``float``, defaults to ``0.001``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(stopping_metric="auc", ... stopping_rounds=3, ... stopping_tolerance=1e-2, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> airlines_xgb.auc(valid=True) """ return self._parms.get("stopping_tolerance") @stopping_tolerance.setter def stopping_tolerance(self, stopping_tolerance): assert_is_type(stopping_tolerance, None, numeric) self._parms["stopping_tolerance"] = stopping_tolerance @property def max_runtime_secs(self): """ Maximum allowed runtime in seconds for model training. Use 0 to disable. Type: ``float``, defaults to ``0.0``. :examples: >>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data") >>> covtype[54] = covtype[54].asfactor() >>> predictors = covtype.columns[0:54] >>> response = 'C55' >>> train, valid = covtype.split_frame(ratios=[.8], ... seed=1234) >>> cov_xgb = H2OXGBoostEstimator(max_runtime_secs=10, ... ntrees=10000, ... max_depth=10, ... seed=1234) >>> cov_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(cov_xgb.logloss(valid=True)) """ return self._parms.get("max_runtime_secs") @max_runtime_secs.setter def max_runtime_secs(self, max_runtime_secs): assert_is_type(max_runtime_secs, None, numeric) self._parms["max_runtime_secs"] = max_runtime_secs @property def seed(self): """ Seed for pseudo random number generator (if applicable) Type: ``int``, defaults to ``-1``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], seed=1234) >>> xgb_w_seed_1 = H2OXGBoostEstimator(col_sample_rate=.7, ... seed=1234) >>> xgb_w_seed_1.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> xgb_w_seed_2 = H2OXGBoostEstimator(col_sample_rate = .7, ... seed = 1234) >>> xgb_w_seed_2.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print('auc for the 1st model built with a seed:', ... xgb_w_seed_1.auc(valid=True)) >>> print('auc for the 2nd model built with a seed:', ... xgb_w_seed_2.auc(valid=True)) """ return self._parms.get("seed") @seed.setter def seed(self, seed): assert_is_type(seed, None, int) self._parms["seed"] = seed @property def distribution(self): """ Distribution function Type: ``Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber"]``, defaults to ``"auto"``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "cylinders" >>> train, valid = cars.split_frame(ratios=[.8], ... seed=1234) >>> cars_xgb = H2OXGBoostEstimator(distribution="poisson", ... seed=1234) >>> cars_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> cars_xgb.mse(valid=True) """ return self._parms.get("distribution") @distribution.setter def distribution(self, distribution): assert_is_type(distribution, None, Enum("auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber")) self._parms["distribution"] = distribution @property def tweedie_power(self): """ Tweedie power for Tweedie regression, must be between 1 and 2. Type: ``float``, defaults to ``1.5``. :examples: >>> insurance = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv") >>> predictors = insurance.columns[0:4] >>> response = 'Claims' >>> insurance['Group'] = insurance['Group'].asfactor() >>> insurance['Age'] = insurance['Age'].asfactor() >>> train, valid = insurance.split_frame(ratios=[.8], ... seed=1234) >>> insurance_xgb = H2OXGBoostEstimator(distribution="tweedie", ... tweedie_power=1.2, ... seed=1234) >>> insurance_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(insurance_xgb.mse(valid=True)) """ return self._parms.get("tweedie_power") @tweedie_power.setter def tweedie_power(self, tweedie_power): assert_is_type(tweedie_power, None, numeric) self._parms["tweedie_power"] = tweedie_power @property def categorical_encoding(self): """ Encoding scheme for categorical features Type: ``Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"]``, defaults to ``"auto"``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> encoding = "one_hot_explicit" >>> airlines_xgb = H2OXGBoostEstimator(categorical_encoding=encoding, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> airlines_xgb.auc(valid=True) """ return self._parms.get("categorical_encoding") @categorical_encoding.setter def categorical_encoding(self, categorical_encoding): assert_is_type(categorical_encoding, None, Enum("auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited")) self._parms["categorical_encoding"] = categorical_encoding @property def quiet_mode(self): """ Enable quiet mode Type: ``bool``, defaults to ``True``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(seed=1234, quiet_mode=True) >>> titanic_xgb.train(x=predictors ... y=response, ... training_frame=train, ... validation_frame=valid) >>> titanic_xgb.mse(valid=True) """ return self._parms.get("quiet_mode") @quiet_mode.setter def quiet_mode(self, quiet_mode): assert_is_type(quiet_mode, None, bool) self._parms["quiet_mode"] = quiet_mode @property def checkpoint(self): """ Model checkpoint to resume training with. Type: ``Union[None, str, H2OEstimator]``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","year","economy_20mpg"] >>> response = "acceleration" >>> from h2o.estimators import H2OXGBoostEstimator >>> cars_xgb = H2OXGBoostEstimator(seed=1234) >>> train, valid = cars.split_frame(ratios=[.8]) >>> cars_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> cars_xgb.mse() >>> cars_xgb_continued = H2OXGBoostEstimator(checkpoint=cars_xgb.model_id, ... ntrees=51, ... seed=1234) >>> cars_xgb_continued.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> cars_xgb_continued.mse() """ return self._parms.get("checkpoint") @checkpoint.setter def checkpoint(self, checkpoint): assert_is_type(checkpoint, None, str, H2OEstimator) self._parms["checkpoint"] = checkpoint @property def export_checkpoints_dir(self): """ Automatically export generated models to this directory. Type: ``str``. :examples: >>> import tempfile >>> from h2o.grid.grid_search import H2OGridSearch >>> from os import listdir >>> airlines = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip", destination_frame="air.hex") >>> predictors = ["DayofMonth", "DayOfWeek"] >>> response = "IsDepDelayed" >>> hyper_parameters = {'ntrees': [5,10]} >>> search_crit = {'strategy': "RandomDiscrete", ... 'max_models': 5, ... 'seed': 1234, ... 'stopping_rounds': 3, ... 'stopping_metric': "AUTO", ... 'stopping_tolerance': 1e-2} >>> checkpoints_dir = tempfile.mkdtemp() >>> air_grid = H2OGridSearch(H2OXGBoostEstimator, ... hyper_params=hyper_parameters, ... search_criteria=search_crit) >>> air_grid.train(x=predictors, ... y=response, ... training_frame=airlines, ... distribution="bernoulli", ... learn_rate=0.1, ... max_depth=3, ... export_checkpoints_dir=checkpoints_dir) >>> len(listdir(checkpoints_dir)) """ return self._parms.get("export_checkpoints_dir") @export_checkpoints_dir.setter def export_checkpoints_dir(self, export_checkpoints_dir): assert_is_type(export_checkpoints_dir, None, str) self._parms["export_checkpoints_dir"] = export_checkpoints_dir @property def custom_metric_func(self): """ Reference to custom evaluation function, format: `language:keyName=funcName` Type: ``str``. """ return self._parms.get("custom_metric_func") @custom_metric_func.setter def custom_metric_func(self, custom_metric_func): assert_is_type(custom_metric_func, None, str) self._parms["custom_metric_func"] = custom_metric_func @property def ntrees(self): """ (same as n_estimators) Number of trees. Type: ``int``, defaults to ``50``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> tree_num = [20, 50, 80, 110, 140, 170, 200] >>> label = ["20", "50", "80", "110", ... "140", "170", "200"] >>> for key, num in enumerate(tree_num): # Input integer for 'num' and 'key' >>> titanic_xgb = H2OXGBoostEstimator(ntrees=num, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(label[key], 'training score', ... titanic_xgb.auc(train=True)) >>> print(label[key], 'validation score', ... titanic_xgb.auc(valid=True)) """ return self._parms.get("ntrees") @ntrees.setter def ntrees(self, ntrees): assert_is_type(ntrees, None, int) self._parms["ntrees"] = ntrees @property def max_depth(self): """ Maximum tree depth (0 for unlimited). Type: ``int``, defaults to ``6``. :examples: >>> df = h2o.import_file(path = "http://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> response = "survived" >>> df[response] = df[response].asfactor() >>> predictors = df.columns >>> del predictors[1:3] >>> train, valid, test = df.split_frame(ratios=[0.6,0.2], ... seed=1234, ... destination_frames= ... ['train.hex', ... 'valid.hex', ... 'test.hex']) >>> xgb = H2OXGBoostEstimator() >>> xgb.train(x=predictors, ... y=response, ... training_frame=train) >>> perf = xgb.model_performance(valid) >>> print(perf.auc()) """ return self._parms.get("max_depth") @max_depth.setter def max_depth(self, max_depth): assert_is_type(max_depth, None, int) self._parms["max_depth"] = max_depth @property def min_rows(self): """ (same as min_child_weight) Fewest allowed (weighted) observations in a leaf. Type: ``float``, defaults to ``1.0``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(min_rows=16, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(titanic_xgb.auc(valid=True)) """ return self._parms.get("min_rows") @min_rows.setter def min_rows(self, min_rows): assert_is_type(min_rows, None, numeric) self._parms["min_rows"] = min_rows @property def min_child_weight(self): """ (same as min_rows) Fewest allowed (weighted) observations in a leaf. Type: ``float``, defaults to ``1.0``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(min_child_weight=16, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(titanic_xgb.auc(valid=True)) """ return self._parms.get("min_child_weight") @min_child_weight.setter def min_child_weight(self, min_child_weight): assert_is_type(min_child_weight, None, numeric) self._parms["min_child_weight"] = min_child_weight @property def learn_rate(self): """ (same as eta) Learning rate (from 0.0 to 1.0) Type: ``float``, defaults to ``0.3``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(ntrees=10000, ... learn_rate=0.01, ... stopping_rounds=5, ... stopping_metric="AUC", ... stopping_tolerance=1e-4, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(titanic_xgb.auc(valid=True)) """ return self._parms.get("learn_rate") @learn_rate.setter def learn_rate(self, learn_rate): assert_is_type(learn_rate, None, numeric) self._parms["learn_rate"] = learn_rate @property def eta(self): """ (same as learn_rate) Learning rate (from 0.0 to 1.0) Type: ``float``, defaults to ``0.3``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(ntrees=10000, ... learn_rate=0.01, ... stopping_rounds=5, ... stopping_metric="AUC", ... stopping_tolerance=1e-4, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(titanic_xgb.auc(valid=True)) """ return self._parms.get("eta") @eta.setter def eta(self, eta): assert_is_type(eta, None, numeric) self._parms["eta"] = eta @property def sample_rate(self): """ (same as subsample) Row sample rate per tree (from 0.0 to 1.0) Type: ``float``, defaults to ``1.0``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(sample_rate=.7, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(airlines_xgb.auc(valid=True)) """ return self._parms.get("sample_rate") @sample_rate.setter def sample_rate(self, sample_rate): assert_is_type(sample_rate, None, numeric) self._parms["sample_rate"] = sample_rate @property def subsample(self): """ (same as sample_rate) Row sample rate per tree (from 0.0 to 1.0) Type: ``float``, defaults to ``1.0``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(sample_rate=.7, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(airlines_xgb.auc(valid=True)) """ return self._parms.get("subsample") @subsample.setter def subsample(self, subsample): assert_is_type(subsample, None, numeric) self._parms["subsample"] = subsample @property def col_sample_rate(self): """ (same as colsample_bylevel) Column sample rate (from 0.0 to 1.0) Type: ``float``, defaults to ``1.0``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(col_sample_rate=.7, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(airlines_xgb.auc(valid=True)) """ return self._parms.get("col_sample_rate") @col_sample_rate.setter def col_sample_rate(self, col_sample_rate): assert_is_type(col_sample_rate, None, numeric) self._parms["col_sample_rate"] = col_sample_rate @property def colsample_bylevel(self): """ (same as col_sample_rate) Column sample rate (from 0.0 to 1.0) Type: ``float``, defaults to ``1.0``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(col_sample_rate=.7, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(airlines_xgb.auc(valid=True)) """ return self._parms.get("colsample_bylevel") @colsample_bylevel.setter def colsample_bylevel(self, colsample_bylevel): assert_is_type(colsample_bylevel, None, numeric) self._parms["colsample_bylevel"] = colsample_bylevel @property def col_sample_rate_per_tree(self): """ (same as colsample_bytree) Column sample rate per tree (from 0.0 to 1.0) Type: ``float``, defaults to ``1.0``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(col_sample_rate_per_tree=.7, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(airlines_xgb.auc(valid=True)) """ return self._parms.get("col_sample_rate_per_tree") @col_sample_rate_per_tree.setter def col_sample_rate_per_tree(self, col_sample_rate_per_tree): assert_is_type(col_sample_rate_per_tree, None, numeric) self._parms["col_sample_rate_per_tree"] = col_sample_rate_per_tree @property def colsample_bytree(self): """ (same as col_sample_rate_per_tree) Column sample rate per tree (from 0.0 to 1.0) Type: ``float``, defaults to ``1.0``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(col_sample_rate_per_tree=.7, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(airlines_xgb.auc(valid=True)) """ return self._parms.get("colsample_bytree") @colsample_bytree.setter def colsample_bytree(self, colsample_bytree): assert_is_type(colsample_bytree, None, numeric) self._parms["colsample_bytree"] = colsample_bytree @property def colsample_bynode(self): """ Column sample rate per tree node (from 0.0 to 1.0) Type: ``float``, defaults to ``1.0``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(colsample_bynode=.5, ... seed=1234) >>> airlines_xgb.train(x=predictors, y=response, ... training_frame=train, validation_frame=valid) >>> print(airlines_xgb.auc(valid=True)) """ return self._parms.get("colsample_bynode") @colsample_bynode.setter def colsample_bynode(self, colsample_bynode): assert_is_type(colsample_bynode, None, numeric) self._parms["colsample_bynode"] = colsample_bynode @property def max_abs_leafnode_pred(self): """ (same as max_delta_step) Maximum absolute value of a leaf node prediction Type: ``float``, defaults to ``0.0``. :examples: >>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data") >>> covtype[54] = covtype[54].asfactor() >>> predictors = covtype.columns[0:54] >>> response = 'C55' >>> train, valid = covtype.split_frame(ratios=[.8], ... seed=1234) >>> cov_xgb = H2OXGBoostEstimator(max_abs_leafnode_pred=float(2), ... seed=1234) >>> cov_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(cov_xgb.logloss(valid=True)) """ return self._parms.get("max_abs_leafnode_pred") @max_abs_leafnode_pred.setter def max_abs_leafnode_pred(self, max_abs_leafnode_pred): assert_is_type(max_abs_leafnode_pred, None, float) self._parms["max_abs_leafnode_pred"] = max_abs_leafnode_pred @property def max_delta_step(self): """ (same as max_abs_leafnode_pred) Maximum absolute value of a leaf node prediction Type: ``float``, defaults to ``0.0``. :examples: >>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data") >>> covtype[54] = covtype[54].asfactor() >>> predictors = covtype.columns[0:54] >>> response = 'C55' >>> train, valid = covtype.split_frame(ratios=[.8], ... seed=1234) >>> cov_xgb = H2OXGBoostEstimator(max_delta_step=float(2), ... seed=1234) >>> cov_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(cov_xgb.logloss(valid=True)) """ return self._parms.get("max_delta_step") @max_delta_step.setter def max_delta_step(self, max_delta_step): assert_is_type(max_delta_step, None, float) self._parms["max_delta_step"] = max_delta_step @property def monotone_constraints(self): """ A mapping representing monotonic constraints. Use +1 to enforce an increasing constraint and -1 to specify a decreasing constraint. Type: ``dict``. :examples: >>> prostate_hex = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip") >>> prostate_hex["CAPSULE"] = prostate_hex["CAPSULE"].asfactor() >>> response = "CAPSULE" >>> seed=42 >>> monotone_constraints={"AGE":1} >>> xgb_model = H2OXGBoostEstimator(seed=seed, ... monotone_constraints=monotone_constraints) >>> xgb_model.train(y=response, ... ignored_columns=["ID"], ... training_frame=prostate_hex) >>> xgb_model.scoring_history() """ return self._parms.get("monotone_constraints") @monotone_constraints.setter def monotone_constraints(self, monotone_constraints): assert_is_type(monotone_constraints, None, dict) self._parms["monotone_constraints"] = monotone_constraints @property def interaction_constraints(self): """ A set of allowed column interactions. Type: ``List[List[str]]``. """ return self._parms.get("interaction_constraints") @interaction_constraints.setter def interaction_constraints(self, interaction_constraints): assert_is_type(interaction_constraints, None, [[str]]) self._parms["interaction_constraints"] = interaction_constraints @property def score_tree_interval(self): """ Score the model after every so many trees. Disabled if set to 0. Type: ``int``, defaults to ``0``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(score_tree_interval=5, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> airlines_xgb.scoring_history() """ return self._parms.get("score_tree_interval") @score_tree_interval.setter def score_tree_interval(self, score_tree_interval): assert_is_type(score_tree_interval, None, int) self._parms["score_tree_interval"] = score_tree_interval @property def min_split_improvement(self): """ (same as gamma) Minimum relative improvement in squared error reduction for a split to happen Type: ``float``, defaults to ``0.0``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(min_split_improvement=0.55, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(titanic_xgb.auc(valid=True)) """ return self._parms.get("min_split_improvement") @min_split_improvement.setter def min_split_improvement(self, min_split_improvement): assert_is_type(min_split_improvement, None, float) self._parms["min_split_improvement"] = min_split_improvement @property def gamma(self): """ (same as min_split_improvement) Minimum relative improvement in squared error reduction for a split to happen Type: ``float``, defaults to ``0.0``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(min_split_improvement=1e-3, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(titanic_xgb.auc(valid=True)) """ return self._parms.get("gamma") @gamma.setter def gamma(self, gamma): assert_is_type(gamma, None, float) self._parms["gamma"] = gamma @property def nthread(self): """ Number of parallel threads that can be used to run XGBoost. Cannot exceed H2O cluster limits (-nthreads parameter). Defaults to maximum available Type: ``int``, defaults to ``-1``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], seed=1234) >>> thread = 4 >>> titanic_xgb = H2OXGBoostEstimator(nthread=thread, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=titanic) >>> print(titanic_xgb.auc(train=True)) """ return self._parms.get("nthread") @nthread.setter def nthread(self, nthread): assert_is_type(nthread, None, int) self._parms["nthread"] = nthread @property def save_matrix_directory(self): """ Directory where to save matrices passed to XGBoost library. Useful for debugging. Type: ``str``. """ return self._parms.get("save_matrix_directory") @save_matrix_directory.setter def save_matrix_directory(self, save_matrix_directory): assert_is_type(save_matrix_directory, None, str) self._parms["save_matrix_directory"] = save_matrix_directory @property def build_tree_one_node(self): """ Run on one node only; no network overhead but fewer cpus used. Suitable for small datasets. Type: ``bool``, defaults to ``False``. """ return self._parms.get("build_tree_one_node") @build_tree_one_node.setter def build_tree_one_node(self, build_tree_one_node): assert_is_type(build_tree_one_node, None, bool) self._parms["build_tree_one_node"] = build_tree_one_node @property def parallelize_cross_validation(self): """ Allow parallel training of cross-validation models Type: ``bool``, defaults to ``True``. """ return self._parms.get("parallelize_cross_validation") @parallelize_cross_validation.setter def parallelize_cross_validation(self, parallelize_cross_validation): assert_is_type(parallelize_cross_validation, None, bool) self._parms["parallelize_cross_validation"] = parallelize_cross_validation @property def calibrate_model(self): """ Use Platt Scaling (default) or Isotonic Regression to calculate calibrated class probabilities. Calibration can provide more accurate estimates of class probabilities. Type: ``bool``, defaults to ``False``. """ return self._parms.get("calibrate_model") @calibrate_model.setter def calibrate_model(self, calibrate_model): assert_is_type(calibrate_model, None, bool) self._parms["calibrate_model"] = calibrate_model @property def calibration_frame(self): """ Data for model calibration Type: ``Union[None, str, H2OFrame]``. """ return self._parms.get("calibration_frame") @calibration_frame.setter def calibration_frame(self, calibration_frame): self._parms["calibration_frame"] = H2OFrame._validate(calibration_frame, 'calibration_frame') @property def calibration_method(self): """ Calibration method to use Type: ``Literal["auto", "platt_scaling", "isotonic_regression"]``, defaults to ``"auto"``. """ return self._parms.get("calibration_method") @calibration_method.setter def calibration_method(self, calibration_method): assert_is_type(calibration_method, None, Enum("auto", "platt_scaling", "isotonic_regression")) self._parms["calibration_method"] = calibration_method @property def max_bins(self): """ For tree_method=hist only: maximum number of bins Type: ``int``, defaults to ``256``. :examples: >>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data") >>> covtype[54] = covtype[54].asfactor() >>> predictors = covtype.columns[0:54] >>> response = 'C55' >>> train, valid = covtype.split_frame(ratios=[.8], ... seed=1234) >>> cov_xgb = H2OXGBoostEstimator(max_bins=200, ... seed=1234) >>> cov_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(cov_xgb.logloss(valid=True)) """ return self._parms.get("max_bins") @max_bins.setter def max_bins(self, max_bins): assert_is_type(max_bins, None, int) self._parms["max_bins"] = max_bins @property def max_leaves(self): """ For tree_method=hist only: maximum number of leaves Type: ``int``, defaults to ``0``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(max_leaves=0, seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(titanic_xgb.auc(valid=True)) """ return self._parms.get("max_leaves") @max_leaves.setter def max_leaves(self, max_leaves): assert_is_type(max_leaves, None, int) self._parms["max_leaves"] = max_leaves @property def sample_type(self): """ For booster=dart only: sample_type Type: ``Literal["uniform", "weighted"]``, defaults to ``"uniform"``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"]= airlines["Year"].asfactor() >>> airlines["Month"]= airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(sample_type="weighted", ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(airlines_xgb.auc(valid=True)) """ return self._parms.get("sample_type") @sample_type.setter def sample_type(self, sample_type): assert_is_type(sample_type, None, Enum("uniform", "weighted")) self._parms["sample_type"] = sample_type @property def normalize_type(self): """ For booster=dart only: normalize_type Type: ``Literal["tree", "forest"]``, defaults to ``"tree"``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(booster='dart', ... normalize_type="tree", ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(titanic_xgb.auc(valid=True)) """ return self._parms.get("normalize_type") @normalize_type.setter def normalize_type(self, normalize_type): assert_is_type(normalize_type, None, Enum("tree", "forest")) self._parms["normalize_type"] = normalize_type @property def rate_drop(self): """ For booster=dart only: rate_drop (0..1) Type: ``float``, defaults to ``0.0``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(rate_drop=0.1, seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(titanic_xgb.auc(valid=True)) """ return self._parms.get("rate_drop") @rate_drop.setter def rate_drop(self, rate_drop): assert_is_type(rate_drop, None, float) self._parms["rate_drop"] = rate_drop @property def one_drop(self): """ For booster=dart only: one_drop Type: ``bool``, defaults to ``False``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(booster='dart', ... one_drop=True, ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(titanic_xgb.auc(valid=True)) """ return self._parms.get("one_drop") @one_drop.setter def one_drop(self, one_drop): assert_is_type(one_drop, None, bool) self._parms["one_drop"] = one_drop @property def skip_drop(self): """ For booster=dart only: skip_drop (0..1) Type: ``float``, defaults to ``0.0``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> airlines_xgb = H2OXGBoostEstimator(skip_drop=0.5, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train) >>> airlines_xgb.auc(train=True) """ return self._parms.get("skip_drop") @skip_drop.setter def skip_drop(self, skip_drop): assert_is_type(skip_drop, None, float) self._parms["skip_drop"] = skip_drop @property def tree_method(self): """ Tree method Type: ``Literal["auto", "exact", "approx", "hist"]``, defaults to ``"auto"``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8], ... seed=1234) >>> >>> airlines_xgb = H2OXGBoostEstimator(seed=1234, ... tree_method="approx") >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(airlines_xgb.auc(valid=True)) """ return self._parms.get("tree_method") @tree_method.setter def tree_method(self, tree_method): assert_is_type(tree_method, None, Enum("auto", "exact", "approx", "hist")) self._parms["tree_method"] = tree_method @property def grow_policy(self): """ Grow policy - depthwise is standard GBM, lossguide is LightGBM Type: ``Literal["depthwise", "lossguide"]``, defaults to ``"depthwise"``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> response = 'survived' >>> titanic["const_1"] = 6 >>> titanic["const_2"] = 7 >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(seed=1234, ... grow_policy="depthwise") >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> titanic_xgb.auc(valid=True) """ return self._parms.get("grow_policy") @grow_policy.setter def grow_policy(self, grow_policy): assert_is_type(grow_policy, None, Enum("depthwise", "lossguide")) self._parms["grow_policy"] = grow_policy @property def booster(self): """ Booster type Type: ``Literal["gbtree", "gblinear", "dart"]``, defaults to ``"gbtree"``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], ... seed=1234) >>> titanic_xgb = H2OXGBoostEstimator(booster='dart', ... normalize_type="tree", ... seed=1234) >>> titanic_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(titanic_xgb.auc(valid=True)) """ return self._parms.get("booster") @booster.setter def booster(self, booster): assert_is_type(booster, None, Enum("gbtree", "gblinear", "dart")) self._parms["booster"] = booster @property def reg_lambda(self): """ L2 regularization Type: ``float``, defaults to ``1.0``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid= airlines.split_frame(ratios=[.8]) >>> airlines_xgb = H2OXGBoostEstimator(reg_lambda=.0001, ... seed=1234) >>> airlines_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(airlines_xgb.auc(valid=True)) """ return self._parms.get("reg_lambda") @reg_lambda.setter def reg_lambda(self, reg_lambda): assert_is_type(reg_lambda, None, float) self._parms["reg_lambda"] = reg_lambda @property def reg_alpha(self): """ L1 regularization Type: ``float``, defaults to ``0.0``. :examples: >>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv") >>> predictors = boston.columns[:-1] >>> response = "medv" >>> boston['chas'] = boston['chas'].asfactor() >>> train, valid = boston.split_frame(ratios=[.8]) >>> boston_xgb = H2OXGBoostEstimator(reg_alpha=.25) >>> boston_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> print(boston_xgb.mse(valid=True)) """ return self._parms.get("reg_alpha") @reg_alpha.setter def reg_alpha(self, reg_alpha): assert_is_type(reg_alpha, None, float) self._parms["reg_alpha"] = reg_alpha @property def dmatrix_type(self): """ Type of DMatrix. For sparse, NAs and 0 are treated equally. Type: ``Literal["auto", "dense", "sparse"]``, defaults to ``"auto"``. :examples: >>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv") >>> predictors = boston.columns[:-1] >>> response = "medv" >>> boston['chas'] = boston['chas'].asfactor() >>> train, valid = boston.split_frame(ratios=[.8]) >>> boston_xgb = H2OXGBoostEstimator(dmatrix_type="auto", ... seed=1234) >>> boston_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> boston_xgb.mse() """ return self._parms.get("dmatrix_type") @dmatrix_type.setter def dmatrix_type(self, dmatrix_type): assert_is_type(dmatrix_type, None, Enum("auto", "dense", "sparse")) self._parms["dmatrix_type"] = dmatrix_type @property def backend(self): """ Backend. By default (auto), a GPU is used if available. Type: ``Literal["auto", "gpu", "cpu"]``, defaults to ``"auto"``. :examples: >>> pros = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv") >>> pros["CAPSULE"] = pros["CAPSULE"].asfactor() >>> pros_xgb = H2OXGBoostEstimator(tree_method="exact", ... seed=123, ... backend="cpu") >>> pros_xgb.train(y="CAPSULE", ... ignored_columns=["ID"], ... training_frame=pros) >>> pros_xgb.auc() """ return self._parms.get("backend") @backend.setter def backend(self, backend): assert_is_type(backend, None, Enum("auto", "gpu", "cpu")) self._parms["backend"] = backend @property def gpu_id(self): """ Which GPU(s) to use. Type: ``List[int]``. :examples: >>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv") >>> predictors = boston.columns[:-1] >>> response = "medv" >>> boston['chas'] = boston['chas'].asfactor() >>> train, valid = boston.split_frame(ratios=[.8]) >>> boston_xgb = H2OXGBoostEstimator(gpu_id=0, ... seed=1234) >>> boston_xgb.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> boston_xgb.mse() """ return self._parms.get("gpu_id") @gpu_id.setter def gpu_id(self, gpu_id): assert_is_type(gpu_id, None, int, [int]) self._parms["gpu_id"] = gpu_id @property def gainslift_bins(self): """ Gains/Lift table number of bins. 0 means disabled.. Default value -1 means automatic binning. Type: ``int``, defaults to ``-1``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/airlines_train.csv") >>> model = H2OXGBoostEstimator(ntrees=1, gainslift_bins=20) >>> model.train(x=["Origin", "Distance"], ... y="IsDepDelayed", ... training_frame=airlines) >>> model.gains_lift() """ return self._parms.get("gainslift_bins") @gainslift_bins.setter def gainslift_bins(self, gainslift_bins): assert_is_type(gainslift_bins, None, int) self._parms["gainslift_bins"] = gainslift_bins @property def auc_type(self): """ Set default multinomial AUC type. Type: ``Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]``, defaults to ``"auto"``. """ return self._parms.get("auc_type") @auc_type.setter def auc_type(self, auc_type): assert_is_type(auc_type, None, Enum("auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo")) self._parms["auc_type"] = auc_type @property def scale_pos_weight(self): """ Controls the effect of observations with positive labels in relation to the observations with negative labels on gradient calculation. Useful for imbalanced problems. Type: ``float``, defaults to ``1.0``. """ return self._parms.get("scale_pos_weight") @scale_pos_weight.setter def scale_pos_weight(self, scale_pos_weight): assert_is_type(scale_pos_weight, None, float) self._parms["scale_pos_weight"] = scale_pos_weight @property def eval_metric(self): """ Specification of evaluation metric that will be passed to the native XGBoost backend. Type: ``str``. """ return self._parms.get("eval_metric") @eval_metric.setter def eval_metric(self, eval_metric): assert_is_type(eval_metric, None, str) self._parms["eval_metric"] = eval_metric @property def score_eval_metric_only(self): """ If enabled, score only the evaluation metric. This can make model training faster if scoring is frequent (eg. each iteration). Type: ``bool``, defaults to ``False``. """ return self._parms.get("score_eval_metric_only") @score_eval_metric_only.setter def score_eval_metric_only(self, score_eval_metric_only): assert_is_type(score_eval_metric_only, None, bool) self._parms["score_eval_metric_only"] = score_eval_metric_only
[docs] @staticmethod def available(): """ Ask the H2O server whether a XGBoost model can be built (depends on availability of native backends). :return: True if a XGBoost model can be built, or False otherwise. :examples: >>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv") >>> predictors = boston.columns[:-1] >>> response = "medv" >>> boston['chas'] = boston['chas'].asfactor() >>> train, valid = boston.split_frame(ratios=[.8]) >>> boston_xgb = H2OXGBoostEstimator(seed=1234) >>> boston_xgb.available() """ if "XGBoost" not in h2o.cluster().list_core_extensions(): print("Cannot build an XGBoost model - no backend found.") return False else: return True
[docs] def convert_H2OXGBoostParams_2_XGBoostParams(self): """ In order to use convert_H2OXGBoostParams_2_XGBoostParams and convert_H2OFrame_2_DMatrix, you must import the following toolboxes: xgboost, pandas, numpy and scipy.sparse. Given an H2OXGBoost model, this method will generate the corresponding parameters that should be used by native XGBoost in order to give exactly the same result, assuming that the same dataset (derived from h2oFrame) is used to train the native XGBoost model. Follow the steps below to compare H2OXGBoost and native XGBoost: 1. Train the H2OXGBoost model with H2OFrame trainFile and generate a prediction: - h2oModelD = H2OXGBoostEstimator(\*\*h2oParamsD) # parameters specified as a dict() - h2oModelD.train(x=myX, y=y, training_frame=trainFile) # train with H2OFrame trainFile - h2oPredict = h2oPredictD = h2oModelD.predict(trainFile) 2. Derive the DMatrix from H2OFrame: - nativeDMatrix = trainFile.convert_H2OFrame_2_DMatrix(myX, y, h2oModelD) 3. Derive the parameters for native XGBoost: - nativeParams = h2oModelD.convert_H2OXGBoostParams_2_XGBoostParams() 4. Train your native XGBoost model and generate a prediction: - nativeModel = xgb.train(params=nativeParams[0], dtrain=nativeDMatrix, num_boost_round=nativeParams[1]) - nativePredict = nativeModel.predict(data=nativeDMatrix, ntree_limit=nativeParams[1] 5. Compare the predictions h2oPredict from H2OXGBoost, nativePredict from native XGBoost. :return: nativeParams, num_boost_round """ import xgboost as xgb nativeParams = self._model_json["output"]["native_parameters"] nativeXGBoostParams = dict() for (a,keyname,keyvalue) in nativeParams.cell_values: nativeXGBoostParams[keyname]=keyvalue paramsSet = self.full_parameters return nativeXGBoostParams, paramsSet['ntrees']['actual_value']