Source code for h2o.estimators.gbm

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai;  Apache License Version 2.0 (see LICENSE for details)
#

from h2o.estimators.estimator_base import H2OEstimator
from h2o.exceptions import H2OValueError
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type, Enum, numeric


[docs]class H2OGradientBoostingEstimator(H2OEstimator): """ Gradient Boosting Machine Builds gradient boosted trees on a parsed data set, for regression or classification. The default distribution function will guess the model type based on the response column type. Otherwise, the response column must be an enum for "bernoulli" or "multinomial", and numeric for all other distributions. """ algo = "gbm" supervised_learning = True _options_ = {'model_extensions': ['h2o.model.extensions.ScoringHistoryTrees', 'h2o.model.extensions.VariableImportance', 'h2o.model.extensions.FeatureInteraction', 'h2o.model.extensions.Trees', 'h2o.model.extensions.SupervisedTrees', 'h2o.model.extensions.HStatistic', 'h2o.model.extensions.Contributions', 'h2o.model.extensions.Fairness', 'h2o.model.extensions.RowToTreeAssignment'], 'verbose': True} def __init__(self, model_id=None, # type: Optional[Union[None, str, H2OEstimator]] training_frame=None, # type: Optional[Union[None, str, H2OFrame]] validation_frame=None, # type: Optional[Union[None, str, H2OFrame]] nfolds=0, # type: int keep_cross_validation_models=True, # type: bool keep_cross_validation_predictions=False, # type: bool keep_cross_validation_fold_assignment=False, # type: bool score_each_iteration=False, # type: bool score_tree_interval=0, # type: int fold_assignment="auto", # type: Literal["auto", "random", "modulo", "stratified"] fold_column=None, # type: Optional[str] response_column=None, # type: Optional[str] ignored_columns=None, # type: Optional[List[str]] ignore_const_cols=True, # type: bool offset_column=None, # type: Optional[str] weights_column=None, # type: Optional[str] balance_classes=False, # type: bool class_sampling_factors=None, # type: Optional[List[float]] max_after_balance_size=5.0, # type: float max_confusion_matrix_size=20, # type: int ntrees=50, # type: int max_depth=5, # type: int min_rows=10.0, # type: float nbins=20, # type: int nbins_top_level=1024, # type: int nbins_cats=1024, # type: int 
r2_stopping=None, # type: Optional[float] stopping_rounds=0, # type: int stopping_metric="auto", # type: Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"] stopping_tolerance=0.001, # type: float max_runtime_secs=0.0, # type: float seed=-1, # type: int build_tree_one_node=False, # type: bool learn_rate=0.1, # type: float learn_rate_annealing=1.0, # type: float distribution="auto", # type: Literal["auto", "bernoulli", "quasibinomial", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber", "custom"] quantile_alpha=0.5, # type: float tweedie_power=1.5, # type: float huber_alpha=0.9, # type: float checkpoint=None, # type: Optional[Union[None, str, H2OEstimator]] sample_rate=1.0, # type: float sample_rate_per_class=None, # type: Optional[List[float]] col_sample_rate=1.0, # type: float col_sample_rate_change_per_level=1.0, # type: float col_sample_rate_per_tree=1.0, # type: float min_split_improvement=1e-05, # type: float histogram_type="auto", # type: Literal["auto", "uniform_adaptive", "random", "quantiles_global", "round_robin", "uniform_robust"] max_abs_leafnode_pred=None, # type: Optional[float] pred_noise_bandwidth=0.0, # type: float categorical_encoding="auto", # type: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"] calibrate_model=False, # type: bool calibration_frame=None, # type: Optional[Union[None, str, H2OFrame]] calibration_method="auto", # type: Literal["auto", "platt_scaling", "isotonic_regression"] custom_metric_func=None, # type: Optional[str] custom_distribution_func=None, # type: Optional[str] export_checkpoints_dir=None, # type: Optional[str] in_training_checkpoints_dir=None, # type: Optional[str] in_training_checkpoints_tree_interval=1, # type: int monotone_constraints=None, # type: Optional[dict] 
check_constant_response=True, # type: bool gainslift_bins=-1, # type: int auc_type="auto", # type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"] interaction_constraints=None, # type: Optional[List[List[str]]] auto_rebalance=True, # type: bool ): """ :param model_id: Destination id for this model; auto-generated if not specified. Defaults to ``None``. :type model_id: Union[None, str, H2OEstimator], optional :param training_frame: Id of the training data frame. Defaults to ``None``. :type training_frame: Union[None, str, H2OFrame], optional :param validation_frame: Id of the validation data frame. Defaults to ``None``. :type validation_frame: Union[None, str, H2OFrame], optional :param nfolds: Number of folds for K-fold cross-validation (0 to disable or >= 2). Defaults to ``0``. :type nfolds: int :param keep_cross_validation_models: Whether to keep the cross-validation models. Defaults to ``True``. :type keep_cross_validation_models: bool :param keep_cross_validation_predictions: Whether to keep the predictions of the cross-validation models. Defaults to ``False``. :type keep_cross_validation_predictions: bool :param keep_cross_validation_fold_assignment: Whether to keep the cross-validation fold assignment. Defaults to ``False``. :type keep_cross_validation_fold_assignment: bool :param score_each_iteration: Whether to score during each iteration of model training. Defaults to ``False``. :type score_each_iteration: bool :param score_tree_interval: Score the model after every so many trees. Disabled if set to 0. Defaults to ``0``. :type score_tree_interval: int :param fold_assignment: Cross-validation fold assignment scheme, if fold_column is not specified. The 'Stratified' option will stratify the folds based on the response variable, for classification problems. Defaults to ``"auto"``. 
:type fold_assignment: Literal["auto", "random", "modulo", "stratified"] :param fold_column: Column with cross-validation fold index assignment per observation. Defaults to ``None``. :type fold_column: str, optional :param response_column: Response variable column. Defaults to ``None``. :type response_column: str, optional :param ignored_columns: Names of columns to ignore for training. Defaults to ``None``. :type ignored_columns: List[str], optional :param ignore_const_cols: Ignore constant columns. Defaults to ``True``. :type ignore_const_cols: bool :param offset_column: Offset column. This will be added to the combination of columns before applying the link function. Defaults to ``None``. :type offset_column: str, optional :param weights_column: Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data frame. This is typically the number of times a row is repeated, but non-integer values are supported as well. During training, rows with higher weights matter more, due to the larger loss function pre-factor. If you set weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get an accurate prediction, remove all rows with weight == 0. Defaults to ``None``. :type weights_column: str, optional :param balance_classes: Balance training data class counts via over/under-sampling (for imbalanced data). Defaults to ``False``. :type balance_classes: bool :param class_sampling_factors: Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling factors will be automatically computed to obtain class balance during training. Requires balance_classes. Defaults to ``None``. 
:type class_sampling_factors: List[float], optional :param max_after_balance_size: Maximum relative size of the training data after balancing class counts (can be less than 1.0). Requires balance_classes. Defaults to ``5.0``. :type max_after_balance_size: float :param max_confusion_matrix_size: [Deprecated] Maximum size (# classes) for confusion matrices to be printed in the Logs Defaults to ``20``. :type max_confusion_matrix_size: int :param ntrees: Number of trees. Defaults to ``50``. :type ntrees: int :param max_depth: Maximum tree depth (0 for unlimited). Defaults to ``5``. :type max_depth: int :param min_rows: Fewest allowed (weighted) observations in a leaf. Defaults to ``10.0``. :type min_rows: float :param nbins: For numerical columns (real/int), build a histogram of (at least) this many bins, then split at the best point Defaults to ``20``. :type nbins: int :param nbins_top_level: For numerical columns (real/int), build a histogram of (at most) this many bins at the root level, then decrease by factor of two per level Defaults to ``1024``. :type nbins_top_level: int :param nbins_cats: For categorical columns (factors), build a histogram of this many bins, then split at the best point. Higher values can lead to more overfitting. Defaults to ``1024``. :type nbins_cats: int :param r2_stopping: r2_stopping is no longer supported and will be ignored if set - please use stopping_rounds, stopping_metric and stopping_tolerance instead. Previous version of H2O would stop making trees when the R^2 metric equals or exceeds this Defaults to ``∞``. :type r2_stopping: float :param stopping_rounds: Early stopping based on convergence of stopping_metric. Stop if simple moving average of length k of the stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable) Defaults to ``0``. 
:type stopping_rounds: int :param stopping_metric: Metric to use for early stopping (AUTO: logloss for classification, deviance for regression and anomaly_score for Isolation Forest). Note that custom and custom_increasing can only be used in GBM and DRF with the Python client. Defaults to ``"auto"``. :type stopping_metric: Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"] :param stopping_tolerance: Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this much) Defaults to ``0.001``. :type stopping_tolerance: float :param max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable. Defaults to ``0.0``. :type max_runtime_secs: float :param seed: Seed for pseudo random number generator (if applicable) Defaults to ``-1``. :type seed: int :param build_tree_one_node: Run on one node only; no network overhead but fewer cpus used. Suitable for small datasets. Defaults to ``False``. :type build_tree_one_node: bool :param learn_rate: Learning rate (from 0.0 to 1.0) Defaults to ``0.1``. :type learn_rate: float :param learn_rate_annealing: Scale the learning rate by this factor after each tree (e.g., 0.99 or 0.999) Defaults to ``1.0``. :type learn_rate_annealing: float :param distribution: Distribution function Defaults to ``"auto"``. :type distribution: Literal["auto", "bernoulli", "quasibinomial", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber", "custom"] :param quantile_alpha: Desired quantile for Quantile regression, must be between 0 and 1. Defaults to ``0.5``. :type quantile_alpha: float :param tweedie_power: Tweedie power for Tweedie regression, must be between 1 and 2. Defaults to ``1.5``. 
:type tweedie_power: float :param huber_alpha: Desired quantile for Huber/M-regression (threshold between quadratic and linear loss, must be between 0 and 1). Defaults to ``0.9``. :type huber_alpha: float :param checkpoint: Model checkpoint to resume training with. Defaults to ``None``. :type checkpoint: Union[None, str, H2OEstimator], optional :param sample_rate: Row sample rate per tree (from 0.0 to 1.0) Defaults to ``1.0``. :type sample_rate: float :param sample_rate_per_class: A list of row sample rates per class (relative fraction for each class, from 0.0 to 1.0), for each tree Defaults to ``None``. :type sample_rate_per_class: List[float], optional :param col_sample_rate: Column sample rate (from 0.0 to 1.0) Defaults to ``1.0``. :type col_sample_rate: float :param col_sample_rate_change_per_level: Relative change of the column sampling rate for every level (must be > 0.0 and <= 2.0) Defaults to ``1.0``. :type col_sample_rate_change_per_level: float :param col_sample_rate_per_tree: Column sample rate per tree (from 0.0 to 1.0) Defaults to ``1.0``. :type col_sample_rate_per_tree: float :param min_split_improvement: Minimum relative improvement in squared error reduction for a split to happen Defaults to ``1e-05``. :type min_split_improvement: float :param histogram_type: What type of histogram to use for finding optimal split points Defaults to ``"auto"``. :type histogram_type: Literal["auto", "uniform_adaptive", "random", "quantiles_global", "round_robin", "uniform_robust"] :param max_abs_leafnode_pred: Maximum absolute value of a leaf node prediction Defaults to ``∞``. :type max_abs_leafnode_pred: float :param pred_noise_bandwidth: Bandwidth (sigma) of Gaussian multiplicative noise ~N(1,sigma) for tree node predictions Defaults to ``0.0``. :type pred_noise_bandwidth: float :param categorical_encoding: Encoding scheme for categorical features Defaults to ``"auto"``. 
:type categorical_encoding: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"] :param calibrate_model: Use Platt Scaling (default) or Isotonic Regression to calculate calibrated class probabilities. Calibration can provide more accurate estimates of class probabilities. Defaults to ``False``. :type calibrate_model: bool :param calibration_frame: Data for model calibration Defaults to ``None``. :type calibration_frame: Union[None, str, H2OFrame], optional :param calibration_method: Calibration method to use Defaults to ``"auto"``. :type calibration_method: Literal["auto", "platt_scaling", "isotonic_regression"] :param custom_metric_func: Reference to custom evaluation function, format: `language:keyName=funcName` Defaults to ``None``. :type custom_metric_func: str, optional :param custom_distribution_func: Reference to custom distribution, format: `language:keyName=funcName` Defaults to ``None``. :type custom_distribution_func: str, optional :param export_checkpoints_dir: Automatically export generated models to this directory. Defaults to ``None``. :type export_checkpoints_dir: str, optional :param in_training_checkpoints_dir: Create checkpoints into defined directory while training process is still running. In case of cluster shutdown, this checkpoint can be used to restart training. Defaults to ``None``. :type in_training_checkpoints_dir: str, optional :param in_training_checkpoints_tree_interval: Checkpoint the model after every so many trees. Parameter is used only when in_training_checkpoints_dir is defined Defaults to ``1``. :type in_training_checkpoints_tree_interval: int :param monotone_constraints: A mapping representing monotonic constraints. Use +1 to enforce an increasing constraint and -1 to specify a decreasing constraint. Defaults to ``None``. :type monotone_constraints: dict, optional :param check_constant_response: Check if response column is constant. 
If enabled, then an exception is thrown if the response column is a constant value. If disabled, then the model will train regardless of the response column being a constant value or not. Defaults to ``True``. :type check_constant_response: bool :param gainslift_bins: Gains/Lift table number of bins. 0 means disabled. Default value -1 means automatic binning. Defaults to ``-1``. :type gainslift_bins: int :param auc_type: Set default multinomial AUC type. Defaults to ``"auto"``. :type auc_type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"] :param interaction_constraints: A set of allowed column interactions. Defaults to ``None``. :type interaction_constraints: List[List[str]], optional :param auto_rebalance: Allow automatic rebalancing of training and validation datasets. Defaults to ``True``. :type auto_rebalance: bool """ super(H2OGradientBoostingEstimator, self).__init__() self._parms = {} self._id = self._parms['model_id'] = model_id self.training_frame = training_frame self.validation_frame = validation_frame self.nfolds = nfolds self.keep_cross_validation_models = keep_cross_validation_models self.keep_cross_validation_predictions = keep_cross_validation_predictions self.keep_cross_validation_fold_assignment = keep_cross_validation_fold_assignment self.score_each_iteration = score_each_iteration self.score_tree_interval = score_tree_interval self.fold_assignment = fold_assignment self.fold_column = fold_column self.response_column = response_column self.ignored_columns = ignored_columns self.ignore_const_cols = ignore_const_cols self.offset_column = offset_column self.weights_column = weights_column self.balance_classes = balance_classes self.class_sampling_factors = class_sampling_factors self.max_after_balance_size = max_after_balance_size self.max_confusion_matrix_size = max_confusion_matrix_size self.ntrees = ntrees self.max_depth = max_depth self.min_rows = min_rows self.nbins = nbins self.nbins_top_level = nbins_top_level
self.nbins_cats = nbins_cats self.r2_stopping = r2_stopping self.stopping_rounds = stopping_rounds self.stopping_metric = stopping_metric self.stopping_tolerance = stopping_tolerance self.max_runtime_secs = max_runtime_secs self.seed = seed self.build_tree_one_node = build_tree_one_node self.learn_rate = learn_rate self.learn_rate_annealing = learn_rate_annealing self.distribution = distribution self.quantile_alpha = quantile_alpha self.tweedie_power = tweedie_power self.huber_alpha = huber_alpha self.checkpoint = checkpoint self.sample_rate = sample_rate self.sample_rate_per_class = sample_rate_per_class self.col_sample_rate = col_sample_rate self.col_sample_rate_change_per_level = col_sample_rate_change_per_level self.col_sample_rate_per_tree = col_sample_rate_per_tree self.min_split_improvement = min_split_improvement self.histogram_type = histogram_type self.max_abs_leafnode_pred = max_abs_leafnode_pred self.pred_noise_bandwidth = pred_noise_bandwidth self.categorical_encoding = categorical_encoding self.calibrate_model = calibrate_model self.calibration_frame = calibration_frame self.calibration_method = calibration_method self.custom_metric_func = custom_metric_func self.custom_distribution_func = custom_distribution_func self.export_checkpoints_dir = export_checkpoints_dir self.in_training_checkpoints_dir = in_training_checkpoints_dir self.in_training_checkpoints_tree_interval = in_training_checkpoints_tree_interval self.monotone_constraints = monotone_constraints self.check_constant_response = check_constant_response self.gainslift_bins = gainslift_bins self.auc_type = auc_type self.interaction_constraints = interaction_constraints self.auto_rebalance = auto_rebalance @property def training_frame(self): """ Id of the training data frame. Type: ``Union[None, str, H2OFrame]``. 
:examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed=1234) >>> cars_gbm.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> cars_gbm.auc(valid=True) """ return self._parms.get("training_frame") @training_frame.setter def training_frame(self, training_frame): self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame') @property def validation_frame(self): """ Id of the validation data frame. Type: ``Union[None, str, H2OFrame]``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed=1234) >>> cars_gbm.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> cars_gbm.auc(valid=True) """ return self._parms.get("validation_frame") @validation_frame.setter def validation_frame(self, validation_frame): self._parms["validation_frame"] = H2OFrame._validate(validation_frame, 'validation_frame') @property def nfolds(self): """ Number of folds for K-fold cross-validation (0 to disable or >= 2). Type: ``int``, defaults to ``0``. 
:examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> folds = 5 >>> cars_gbm = H2OGradientBoostingEstimator(nfolds=folds, ... seed=1234) >>> cars_gbm.train(x=predictors, ... y=response, ... training_frame=cars) >>> cars_gbm.auc() """ return self._parms.get("nfolds") @nfolds.setter def nfolds(self, nfolds): assert_is_type(nfolds, None, int) self._parms["nfolds"] = nfolds @property def keep_cross_validation_models(self): """ Whether to keep the cross-validation models. Type: ``bool``, defaults to ``True``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> folds = 5 >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> cars_gbm = H2OGradientBoostingEstimator(keep_cross_validation_models=True, ... nfolds=5, ... seed=1234) >>> cars_gbm.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> cars_gbm.auc() """ return self._parms.get("keep_cross_validation_models") @keep_cross_validation_models.setter def keep_cross_validation_models(self, keep_cross_validation_models): assert_is_type(keep_cross_validation_models, None, bool) self._parms["keep_cross_validation_models"] = keep_cross_validation_models @property def keep_cross_validation_predictions(self): """ Whether to keep the predictions of the cross-validation models. Type: ``bool``, defaults to ``False``.
:examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> folds = 5 >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> cars_gbm = H2OGradientBoostingEstimator(keep_cross_validation_predictions=True, ... nfolds=5, ... seed=1234) >>> cars_gbm.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> cars_gbm.auc() """ return self._parms.get("keep_cross_validation_predictions") @keep_cross_validation_predictions.setter def keep_cross_validation_predictions(self, keep_cross_validation_predictions): assert_is_type(keep_cross_validation_predictions, None, bool) self._parms["keep_cross_validation_predictions"] = keep_cross_validation_predictions @property def keep_cross_validation_fold_assignment(self): """ Whether to keep the cross-validation fold assignment. Type: ``bool``, defaults to ``False``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> folds = 5 >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> cars_gbm = H2OGradientBoostingEstimator(keep_cross_validation_fold_assignment=True, ... nfolds=5, ... seed=1234) >>> cars_gbm.train(x=predictors, ... y=response, ... training_frame=train, ... 
validation_frame=valid) >>> cars_gbm.auc() """ return self._parms.get("keep_cross_validation_fold_assignment") @keep_cross_validation_fold_assignment.setter def keep_cross_validation_fold_assignment(self, keep_cross_validation_fold_assignment): assert_is_type(keep_cross_validation_fold_assignment, None, bool) self._parms["keep_cross_validation_fold_assignment"] = keep_cross_validation_fold_assignment @property def score_each_iteration(self): """ Whether to score during each iteration of model training. Type: ``bool``, defaults to ``False``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios=[.8], ... seed=1234) >>> cars_gbm = H2OGradientBoostingEstimator(score_each_iteration=True, ... ntrees=55, ... seed=1234) >>> cars_gbm.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> cars_gbm.scoring_history() """ return self._parms.get("score_each_iteration") @score_each_iteration.setter def score_each_iteration(self, score_each_iteration): assert_is_type(score_each_iteration, None, bool) self._parms["score_each_iteration"] = score_each_iteration @property def score_tree_interval(self): """ Score the model after every so many trees. Disabled if set to 0. Type: ``int``, defaults to ``0``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> train, valid = cars.split_frame(ratios=[.8], ... seed=1234) >>> cars_gbm = H2OGradientBoostingEstimator(score_tree_interval=5, ... ntrees=55, ... seed=1234) >>> cars_gbm.train(x=predictors, ...
y=response, ... training_frame=train, ... validation_frame=valid) >>> cars_gbm.scoring_history() """ return self._parms.get("score_tree_interval") @score_tree_interval.setter def score_tree_interval(self, score_tree_interval): assert_is_type(score_tree_interval, None, int) self._parms["score_tree_interval"] = score_tree_interval @property def fold_assignment(self): """ Cross-validation fold assignment scheme, if fold_column is not specified. The 'Stratified' option will stratify the folds based on the response variable, for classification problems. Type: ``Literal["auto", "random", "modulo", "stratified"]``, defaults to ``"auto"``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> assignment_type = "random" >>> cars_gbm = H2OGradientBoostingEstimator(fold_assignment=assignment_type, ... nfolds=5, ... seed=1234) >>> cars_gbm.train(x=predictors, y=response, training_frame=cars) >>> cars_gbm.auc(xval=True) """ return self._parms.get("fold_assignment") @fold_assignment.setter def fold_assignment(self, fold_assignment): assert_is_type(fold_assignment, None, Enum("auto", "random", "modulo", "stratified")) self._parms["fold_assignment"] = fold_assignment @property def fold_column(self): """ Column with cross-validation fold index assignment per observation. Type: ``str``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> fold_numbers = cars.kfold_column(n_folds=5, ...
seed=1234) >>> fold_numbers.set_names(["fold_numbers"]) >>> cars = cars.cbind(fold_numbers) >>> cars_gbm = H2OGradientBoostingEstimator(seed=1234) >>> cars_gbm.train(x=predictors, ... y=response, ... training_frame=cars, ... fold_column="fold_numbers") >>> cars_gbm.auc(xval=True) """ return self._parms.get("fold_column") @fold_column.setter def fold_column(self, fold_column): assert_is_type(fold_column, None, str) self._parms["fold_column"] = fold_column @property def response_column(self): """ Response variable column. Type: ``str``. """ return self._parms.get("response_column") @response_column.setter def response_column(self, response_column): assert_is_type(response_column, None, str) self._parms["response_column"] = response_column @property def ignored_columns(self): """ Names of columns to ignore for training. Type: ``List[str]``. """ return self._parms.get("ignored_columns") @ignored_columns.setter def ignored_columns(self, ignored_columns): assert_is_type(ignored_columns, None, [str]) self._parms["ignored_columns"] = ignored_columns @property def ignore_const_cols(self): """ Ignore constant columns. Type: ``bool``, defaults to ``True``. :examples: >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv") >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor() >>> predictors = ["displacement","power","weight","acceleration","year"] >>> response = "economy_20mpg" >>> cars["const_1"] = 6 >>> cars["const_2"] = 7 >>> train, valid = cars.split_frame(ratios=[.8], seed=1234) >>> cars_gbm = H2OGradientBoostingEstimator(seed=1234, ... ignore_const_cols=True) >>> cars_gbm.train(x=predictors, ... y=response, ... training_frame=train, ... 
validation_frame=valid) >>> cars_gbm.auc(valid=True) """ return self._parms.get("ignore_const_cols") @ignore_const_cols.setter def ignore_const_cols(self, ignore_const_cols): assert_is_type(ignore_const_cols, None, bool) self._parms["ignore_const_cols"] = ignore_const_cols @property def offset_column(self): """ Offset column. This will be added to the combination of columns before applying the link function. Type: ``str``. :examples: >>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv") >>> predictors = boston.columns[:-1] >>> response = "medv" >>> boston['chas'] = boston['chas'].asfactor() >>> boston["offset"] = boston["medv"].log() >>> train, valid = boston.split_frame(ratios=[.8], seed=1234) >>> boston_gbm = H2OGradientBoostingEstimator(offset_column="offset", ... seed=1234) >>> boston_gbm.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> boston_gbm.mse(valid=True) """ return self._parms.get("offset_column") @offset_column.setter def offset_column(self, offset_column): assert_is_type(offset_column, None, str) self._parms["offset_column"] = offset_column @property def weights_column(self): """ Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data frame. This is typically the number of times a row is repeated, but non-integer values are supported as well. During training, rows with higher weights matter more, due to the larger loss function pre-factor. If you set weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get an accurate prediction, remove all rows with weight == 0. Type: ``str``. 
:examples:

        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> response = "economy_20mpg"
        >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
        >>> cars_gbm = H2OGradientBoostingEstimator(seed=1234)
        >>> cars_gbm.train(x=predictors,
        ...                y=response,
        ...                training_frame=train,
        ...                validation_frame=valid,
        ...                weights_column="weight")
        >>> cars_gbm.auc(valid=True)
        """
        return self._parms.get("weights_column")

    @weights_column.setter
    def weights_column(self, weights_column):
        assert_is_type(weights_column, None, str)
        self._parms["weights_column"] = weights_column

    @property
    def balance_classes(self):
        """
        Balance training data class counts via over/under-sampling (for imbalanced data).

        Type: ``bool``, defaults to ``False``.

        :examples:

        >>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
        >>> covtype[54] = covtype[54].asfactor()
        >>> predictors = covtype.columns[0:54]
        >>> response = 'C55'
        >>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
        >>> cov_gbm = H2OGradientBoostingEstimator(balance_classes=True,
        ...                                        seed=1234)
        >>> cov_gbm.train(x=predictors,
        ...               y=response,
        ...               training_frame=train,
        ...               validation_frame=valid)
        >>> cov_gbm.logloss(valid=True)
        """
        return self._parms.get("balance_classes")

    @balance_classes.setter
    def balance_classes(self, balance_classes):
        assert_is_type(balance_classes, None, bool)
        self._parms["balance_classes"] = balance_classes

    @property
    def class_sampling_factors(self):
        """
        Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling factors will
        be automatically computed to obtain class balance during training. Requires balance_classes.

        Type: ``List[float]``.
:examples:

        >>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
        >>> covtype[54] = covtype[54].asfactor()
        >>> predictors = covtype.columns[0:54]
        >>> response = 'C55'
        >>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
        >>> sample_factors = [1., 0.5, 1., 1., 1., 1., 1.]
        >>> cov_gbm = H2OGradientBoostingEstimator(balance_classes=True,
        ...                                        class_sampling_factors=sample_factors,
        ...                                        seed=1234)
        >>> cov_gbm.train(x=predictors,
        ...               y=response,
        ...               training_frame=train,
        ...               validation_frame=valid)
        >>> cov_gbm.logloss(valid=True)
        """
        return self._parms.get("class_sampling_factors")

    @class_sampling_factors.setter
    def class_sampling_factors(self, class_sampling_factors):
        assert_is_type(class_sampling_factors, None, [float])
        self._parms["class_sampling_factors"] = class_sampling_factors

    @property
    def max_after_balance_size(self):
        """
        Maximum relative size of the training data after balancing class counts (can be less than 1.0). Requires
        balance_classes.

        Type: ``float``, defaults to ``5.0``.

        :examples:

        >>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
        >>> covtype[54] = covtype[54].asfactor()
        >>> predictors = covtype.columns[0:54]
        >>> response = 'C55'
        >>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
        >>> max_size = .85
        >>> cov_gbm = H2OGradientBoostingEstimator(balance_classes=True,
        ...                                        max_after_balance_size=max_size,
        ...                                        seed=1234)
        >>> cov_gbm.train(x=predictors,
        ...               y=response,
        ...               training_frame=train,
        ...
validation_frame=valid)
        >>> cov_gbm.logloss(valid=True)
        """
        return self._parms.get("max_after_balance_size")

    @max_after_balance_size.setter
    def max_after_balance_size(self, max_after_balance_size):
        assert_is_type(max_after_balance_size, None, float)
        self._parms["max_after_balance_size"] = max_after_balance_size

    @property
    def max_confusion_matrix_size(self):
        """
        [Deprecated] Maximum size (# classes) for confusion matrices to be printed in the Logs.

        Type: ``int``, defaults to ``20``.
        """
        return self._parms.get("max_confusion_matrix_size")

    @max_confusion_matrix_size.setter
    def max_confusion_matrix_size(self, max_confusion_matrix_size):
        assert_is_type(max_confusion_matrix_size, None, int)
        self._parms["max_confusion_matrix_size"] = max_confusion_matrix_size

    @property
    def ntrees(self):
        """
        Number of trees.

        Type: ``int``, defaults to ``50``.

        :examples:

        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> titanic['survived'] = titanic['survived'].asfactor()
        >>> predictors = titanic.columns
        >>> del predictors[1:3]
        >>> response = 'survived'
        >>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
        >>> tree_num = [20, 50, 80, 110, 140, 170, 200]
        >>> label = ["20", "50", "80", "110", "140", "170", "200"]
        >>> for key, num in enumerate(tree_num):
        ...     titanic_gbm = H2OGradientBoostingEstimator(ntrees=num,
        ...                                                seed=1234)
        ...     titanic_gbm.train(x=predictors,
        ...                       y=response,
        ...                       training_frame=train,
        ...                       validation_frame=valid)
        ...     print(label[key], 'training score', titanic_gbm.auc(train=True))
        ...     print(label[key], 'validation score', titanic_gbm.auc(valid=True))
        """
        return self._parms.get("ntrees")

    @ntrees.setter
    def ntrees(self, ntrees):
        assert_is_type(ntrees, None, int)
        self._parms["ntrees"] = ntrees

    @property
    def max_depth(self):
        """
        Maximum tree depth (0 for unlimited).

        Type: ``int``, defaults to ``5``.
:examples:

        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> response = "economy_20mpg"
        >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
        >>> cars_gbm = H2OGradientBoostingEstimator(ntrees=100,
        ...                                         max_depth=2,
        ...                                         seed=1234)
        >>> cars_gbm.train(x=predictors,
        ...                y=response,
        ...                training_frame=train,
        ...                validation_frame=valid)
        >>> cars_gbm.auc(valid=True)
        """
        return self._parms.get("max_depth")

    @max_depth.setter
    def max_depth(self, max_depth):
        assert_is_type(max_depth, None, int)
        self._parms["max_depth"] = max_depth

    @property
    def min_rows(self):
        """
        Fewest allowed (weighted) observations in a leaf.

        Type: ``float``, defaults to ``10.0``.

        :examples:

        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> response = "economy_20mpg"
        >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
        >>> cars_gbm = H2OGradientBoostingEstimator(min_rows=16,
        ...                                         seed=1234)
        >>> cars_gbm.train(x=predictors,
        ...                y=response,
        ...                training_frame=train,
        ...                validation_frame=valid)
        >>> cars_gbm.auc(valid=True)
        """
        return self._parms.get("min_rows")

    @min_rows.setter
    def min_rows(self, min_rows):
        assert_is_type(min_rows, None, numeric)
        self._parms["min_rows"] = min_rows

    @property
    def nbins(self):
        """
        For numerical columns (real/int), build a histogram of (at least) this many bins, then split at the best point.

        Type: ``int``, defaults to ``20``.
:examples:

        >>> eeg = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/eeg/eeg_eyestate.csv")
        >>> eeg['eyeDetection'] = eeg['eyeDetection'].asfactor()
        >>> predictors = eeg.columns[:-1]
        >>> response = 'eyeDetection'
        >>> train, valid = eeg.split_frame(ratios=[.8], seed=1234)
        >>> bin_num = [16, 32, 64, 128, 256, 512]
        >>> label = ["16", "32", "64", "128", "256", "512"]
        >>> for key, num in enumerate(bin_num):
        ...     eeg_gbm = H2OGradientBoostingEstimator(nbins=num, seed=1234)
        ...     eeg_gbm.train(x=predictors,
        ...                   y=response,
        ...                   training_frame=train,
        ...                   validation_frame=valid)
        ...     print(label[key], 'training score', eeg_gbm.auc(train=True))
        ...     print(label[key], 'validation score', eeg_gbm.auc(valid=True))
        """
        return self._parms.get("nbins")

    @nbins.setter
    def nbins(self, nbins):
        assert_is_type(nbins, None, int)
        self._parms["nbins"] = nbins

    @property
    def nbins_top_level(self):
        """
        For numerical columns (real/int), build a histogram of (at most) this many bins at the root level, then
        decrease by factor of two per level.

        Type: ``int``, defaults to ``1024``.

        :examples:

        >>> eeg = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/eeg/eeg_eyestate.csv")
        >>> eeg['eyeDetection'] = eeg['eyeDetection'].asfactor()
        >>> predictors = eeg.columns[:-1]
        >>> response = 'eyeDetection'
        >>> train, valid = eeg.split_frame(ratios=[.8], seed=1234)
        >>> bin_num = [32, 64, 128, 256, 512, 1024, 2048, 4096]
        >>> label = ["32", "64", "128", "256", "512", "1024", "2048", "4096"]
        >>> for key, num in enumerate(bin_num):
        ...     eeg_gbm = H2OGradientBoostingEstimator(nbins_top_level=num, seed=1234)
        ...     eeg_gbm.train(x=predictors,
        ...                   y=response,
        ...                   training_frame=train,
        ...                   validation_frame=valid)
        ...     print(label[key], 'training score', eeg_gbm.auc(train=True))
        ...
print(label[key], 'validation score', eeg_gbm.auc(valid=True))
        """
        return self._parms.get("nbins_top_level")

    @nbins_top_level.setter
    def nbins_top_level(self, nbins_top_level):
        assert_is_type(nbins_top_level, None, int)
        self._parms["nbins_top_level"] = nbins_top_level

    @property
    def nbins_cats(self):
        """
        For categorical columns (factors), build a histogram of this many bins, then split at the best point. Higher
        values can lead to more overfitting.

        Type: ``int``, defaults to ``1024``.

        :examples:

        >>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
        >>> airlines["Year"] = airlines["Year"].asfactor()
        >>> airlines["Month"] = airlines["Month"].asfactor()
        >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
        >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
        >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
        >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
        ...               "DayOfWeek", "Month", "Distance", "FlightNum"]
        >>> response = "IsDepDelayed"
        >>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
        >>> bin_num = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
        >>> label = ["8", "16", "32", "64", "128", "256", "512", "1024", "2048", "4096"]
        >>> for key, num in enumerate(bin_num):
        ...     airlines_gbm = H2OGradientBoostingEstimator(nbins_cats=num, seed=1234)
        ...     airlines_gbm.train(x=predictors,
        ...                        y=response,
        ...                        training_frame=train,
        ...                        validation_frame=valid)
        ...     print(label[key], 'training score', airlines_gbm.auc(train=True))
        ...     print(label[key], 'validation score', airlines_gbm.auc(valid=True))
        """
        return self._parms.get("nbins_cats")

    @nbins_cats.setter
    def nbins_cats(self, nbins_cats):
        assert_is_type(nbins_cats, None, int)
        self._parms["nbins_cats"] = nbins_cats

    @property
    def r2_stopping(self):
        """
        r2_stopping is no longer supported and will be ignored if set - please use stopping_rounds, stopping_metric
        and stopping_tolerance instead.
Previous version of H2O would stop making trees when the R^2 metric equals or exceeds this.

        Type: ``float``, defaults to ``∞``.
        """
        return self._parms.get("r2_stopping")

    @r2_stopping.setter
    def r2_stopping(self, r2_stopping):
        assert_is_type(r2_stopping, None, numeric)
        self._parms["r2_stopping"] = r2_stopping

    @property
    def stopping_rounds(self):
        """
        Early stopping based on convergence of stopping_metric. Stop if simple moving average of length k of the
        stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable).

        Type: ``int``, defaults to ``0``.

        :examples:

        >>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
        >>> airlines["Year"] = airlines["Year"].asfactor()
        >>> airlines["Month"] = airlines["Month"].asfactor()
        >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
        >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
        >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
        >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
        ...               "DayOfWeek", "Month", "Distance", "FlightNum"]
        >>> response = "IsDepDelayed"
        >>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
        >>> airlines_gbm = H2OGradientBoostingEstimator(stopping_metric="auc",
        ...                                             stopping_rounds=3,
        ...                                             stopping_tolerance=1e-2,
        ...                                             seed=1234)
        >>> airlines_gbm.train(x=predictors,
        ...                    y=response,
        ...                    training_frame=train,
        ...                    validation_frame=valid)
        >>> airlines_gbm.auc(valid=True)
        """
        return self._parms.get("stopping_rounds")

    @stopping_rounds.setter
    def stopping_rounds(self, stopping_rounds):
        assert_is_type(stopping_rounds, None, int)
        self._parms["stopping_rounds"] = stopping_rounds

    @property
    def stopping_metric(self):
        """
        Metric to use for early stopping (AUTO: logloss for classification, deviance for regression and anomaly_score
        for Isolation Forest). Note that custom and custom_increasing can only be used in GBM and DRF with the Python
        client.
Type: ``Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group",
        "misclassification", "mean_per_class_error", "custom", "custom_increasing"]``, defaults to ``"auto"``.

        :examples:

        >>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
        >>> airlines["Year"] = airlines["Year"].asfactor()
        >>> airlines["Month"] = airlines["Month"].asfactor()
        >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
        >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
        >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
        >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
        ...               "DayOfWeek", "Month", "Distance", "FlightNum"]
        >>> response = "IsDepDelayed"
        >>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
        >>> airlines_gbm = H2OGradientBoostingEstimator(stopping_metric="auc",
        ...                                             stopping_rounds=3,
        ...                                             stopping_tolerance=1e-2,
        ...                                             seed=1234)
        >>> airlines_gbm.train(x=predictors,
        ...                    y=response,
        ...                    training_frame=train,
        ...                    validation_frame=valid)
        >>> airlines_gbm.auc(valid=True)
        """
        return self._parms.get("stopping_metric")

    @stopping_metric.setter
    def stopping_metric(self, stopping_metric):
        assert_is_type(stopping_metric, None, Enum("auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"))
        self._parms["stopping_metric"] = stopping_metric

    @property
    def stopping_tolerance(self):
        """
        Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this
        much).

        Type: ``float``, defaults to ``0.001``.
:examples:

        >>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
        >>> airlines["Year"] = airlines["Year"].asfactor()
        >>> airlines["Month"] = airlines["Month"].asfactor()
        >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
        >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
        >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
        >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
        ...               "DayOfWeek", "Month", "Distance", "FlightNum"]
        >>> response = "IsDepDelayed"
        >>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
        >>> airlines_gbm = H2OGradientBoostingEstimator(stopping_metric="auc",
        ...                                             stopping_rounds=3,
        ...                                             stopping_tolerance=1e-2,
        ...                                             seed=1234)
        >>> airlines_gbm.train(x=predictors,
        ...                    y=response,
        ...                    training_frame=train,
        ...                    validation_frame=valid)
        >>> airlines_gbm.auc(valid=True)
        """
        return self._parms.get("stopping_tolerance")

    @stopping_tolerance.setter
    def stopping_tolerance(self, stopping_tolerance):
        assert_is_type(stopping_tolerance, None, numeric)
        self._parms["stopping_tolerance"] = stopping_tolerance

    @property
    def max_runtime_secs(self):
        """
        Maximum allowed runtime in seconds for model training. Use 0 to disable.

        Type: ``float``, defaults to ``0.0``.

        :examples:

        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> response = "economy_20mpg"
        >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
        >>> cars_gbm = H2OGradientBoostingEstimator(max_runtime_secs=10,
        ...                                         ntrees=10000,
        ...                                         max_depth=10,
        ...                                         seed=1234)
        >>> cars_gbm.train(x=predictors,
        ...                y=response,
        ...                training_frame=train,
        ...
validation_frame=valid)
        >>> cars_gbm.auc(valid=True)
        """
        return self._parms.get("max_runtime_secs")

    @max_runtime_secs.setter
    def max_runtime_secs(self, max_runtime_secs):
        assert_is_type(max_runtime_secs, None, numeric)
        self._parms["max_runtime_secs"] = max_runtime_secs

    @property
    def seed(self):
        """
        Seed for pseudo random number generator (if applicable).

        Type: ``int``, defaults to ``-1``.

        :examples:

        >>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
        >>> airlines["Year"] = airlines["Year"].asfactor()
        >>> airlines["Month"] = airlines["Month"].asfactor()
        >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
        >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
        >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
        >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
        ...               "DayOfWeek", "Month", "Distance", "FlightNum"]
        >>> response = "IsDepDelayed"
        >>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
        >>> gbm_w_seed_1 = H2OGradientBoostingEstimator(col_sample_rate=.7,
        ...                                             seed=1234)
        >>> gbm_w_seed_1.train(x=predictors,
        ...                    y=response,
        ...                    training_frame=train,
        ...                    validation_frame=valid)
        >>> print('auc for the 1st model built with a seed:', gbm_w_seed_1.auc(valid=True))
        """
        return self._parms.get("seed")

    @seed.setter
    def seed(self, seed):
        assert_is_type(seed, None, int)
        self._parms["seed"] = seed

    @property
    def build_tree_one_node(self):
        """
        Run on one node only; no network overhead but fewer cpus used. Suitable for small datasets.

        Type: ``bool``, defaults to ``False``.
:examples:

        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> response = "economy_20mpg"
        >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
        >>> cars_gbm = H2OGradientBoostingEstimator(build_tree_one_node=True,
        ...                                         seed=1234)
        >>> cars_gbm.train(x=predictors,
        ...                y=response,
        ...                training_frame=train,
        ...                validation_frame=valid)
        >>> cars_gbm.auc(valid=True)
        """
        return self._parms.get("build_tree_one_node")

    @build_tree_one_node.setter
    def build_tree_one_node(self, build_tree_one_node):
        assert_is_type(build_tree_one_node, None, bool)
        self._parms["build_tree_one_node"] = build_tree_one_node

    @property
    def learn_rate(self):
        """
        Learning rate (from 0.0 to 1.0).

        Type: ``float``, defaults to ``0.1``.

        :examples:

        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> titanic['survived'] = titanic['survived'].asfactor()
        >>> predictors = titanic.columns
        >>> del predictors[1:3]
        >>> response = 'survived'
        >>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
        >>> titanic_gbm = H2OGradientBoostingEstimator(ntrees=10000,
        ...                                            learn_rate=0.01,
        ...                                            stopping_rounds=5,
        ...                                            stopping_metric="AUC",
        ...                                            stopping_tolerance=1e-4,
        ...                                            seed=1234)
        >>> titanic_gbm.train(x=predictors,
        ...                   y=response,
        ...                   training_frame=train,
        ...                   validation_frame=valid)
        >>> titanic_gbm.auc(valid=True)
        """
        return self._parms.get("learn_rate")

    @learn_rate.setter
    def learn_rate(self, learn_rate):
        assert_is_type(learn_rate, None, numeric)
        self._parms["learn_rate"] = learn_rate

    @property
    def learn_rate_annealing(self):
        """
        Scale the learning rate by this factor after each tree (e.g., 0.99 or 0.999).

        Type: ``float``, defaults to ``1.0``.
:examples:

        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> titanic['survived'] = titanic['survived'].asfactor()
        >>> predictors = titanic.columns
        >>> del predictors[1:3]
        >>> response = 'survived'
        >>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
        >>> titanic_gbm = H2OGradientBoostingEstimator(ntrees=10000,
        ...                                            learn_rate=0.05,
        ...                                            learn_rate_annealing=.9,
        ...                                            stopping_rounds=5,
        ...                                            stopping_metric="AUC",
        ...                                            stopping_tolerance=1e-4,
        ...                                            seed=1234)
        >>> titanic_gbm.train(x=predictors,
        ...                   y=response,
        ...                   training_frame=train,
        ...                   validation_frame=valid)
        >>> titanic_gbm.auc(valid=True)
        """
        return self._parms.get("learn_rate_annealing")

    @learn_rate_annealing.setter
    def learn_rate_annealing(self, learn_rate_annealing):
        assert_is_type(learn_rate_annealing, None, numeric)
        self._parms["learn_rate_annealing"] = learn_rate_annealing

    @property
    def distribution(self):
        """
        Distribution function.

        Type: ``Literal["auto", "bernoulli", "quasibinomial", "multinomial", "gaussian", "poisson", "gamma", "tweedie",
        "laplace", "quantile", "huber", "custom"]``, defaults to ``"auto"``.

        :examples:

        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> response = "cylinders"
        >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
        >>> cars_gbm = H2OGradientBoostingEstimator(distribution="poisson",
        ...                                         seed=1234)
        >>> cars_gbm.train(x=predictors,
        ...                y=response,
        ...                training_frame=train,
        ...
validation_frame=valid)
        >>> cars_gbm.mse(valid=True)
        """
        return self._parms.get("distribution")

    @distribution.setter
    def distribution(self, distribution):
        assert_is_type(distribution, None, Enum("auto", "bernoulli", "quasibinomial", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber", "custom"))
        self._parms["distribution"] = distribution

    @property
    def quantile_alpha(self):
        """
        Desired quantile for Quantile regression, must be between 0 and 1.

        Type: ``float``, defaults to ``0.5``.

        :examples:

        >>> boston = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/BostonHousing.csv")
        >>> predictors = boston.columns[:-1]
        >>> response = "medv"
        >>> boston['chas'] = boston['chas'].asfactor()
        >>> train, valid = boston.split_frame(ratios=[.8], seed=1234)
        >>> boston_gbm = H2OGradientBoostingEstimator(distribution="quantile",
        ...                                           quantile_alpha=.8,
        ...                                           seed=1234)
        >>> boston_gbm.train(x=predictors,
        ...                  y=response,
        ...                  training_frame=train,
        ...                  validation_frame=valid)
        >>> boston_gbm.mse(valid=True)
        """
        return self._parms.get("quantile_alpha")

    @quantile_alpha.setter
    def quantile_alpha(self, quantile_alpha):
        assert_is_type(quantile_alpha, None, numeric)
        self._parms["quantile_alpha"] = quantile_alpha

    @property
    def tweedie_power(self):
        """
        Tweedie power for Tweedie regression, must be between 1 and 2.

        Type: ``float``, defaults to ``1.5``.

        :examples:

        >>> insurance = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
        >>> predictors = insurance.columns[0:4]
        >>> response = 'Claims'
        >>> insurance['Group'] = insurance['Group'].asfactor()
        >>> insurance['Age'] = insurance['Age'].asfactor()
        >>> train, valid = insurance.split_frame(ratios=[.8], seed=1234)
        >>> insurance_gbm = H2OGradientBoostingEstimator(distribution="tweedie",
        ...                                              tweedie_power=1.2,
        ...                                              seed=1234)
        >>> insurance_gbm.train(x=predictors,
        ...                     y=response,
        ...                     training_frame=train,
        ...
validation_frame=valid)
        >>> insurance_gbm.mse(valid=True)
        """
        return self._parms.get("tweedie_power")

    @tweedie_power.setter
    def tweedie_power(self, tweedie_power):
        assert_is_type(tweedie_power, None, numeric)
        self._parms["tweedie_power"] = tweedie_power

    @property
    def huber_alpha(self):
        """
        Desired quantile for Huber/M-regression (threshold between quadratic and linear loss, must be between 0 and
        1).

        Type: ``float``, defaults to ``0.9``.

        :examples:

        >>> insurance = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/insurance.csv")
        >>> predictors = insurance.columns[0:4]
        >>> response = 'Claims'
        >>> insurance['Group'] = insurance['Group'].asfactor()
        >>> insurance['Age'] = insurance['Age'].asfactor()
        >>> train, valid = insurance.split_frame(ratios=[.8], seed=1234)
        >>> insurance_gbm = H2OGradientBoostingEstimator(distribution="huber",
        ...                                              huber_alpha=0.9,
        ...                                              seed=1234)
        >>> insurance_gbm.train(x=predictors,
        ...                     y=response,
        ...                     training_frame=train,
        ...                     validation_frame=valid)
        >>> insurance_gbm.mse(valid=True)
        """
        return self._parms.get("huber_alpha")

    @huber_alpha.setter
    def huber_alpha(self, huber_alpha):
        assert_is_type(huber_alpha, None, numeric)
        self._parms["huber_alpha"] = huber_alpha

    @property
    def checkpoint(self):
        """
        Model checkpoint to resume training with.

        Type: ``Union[None, str, H2OEstimator]``.

        :examples:

        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> response = "economy_20mpg"
        >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
        >>> cars_gbm = H2OGradientBoostingEstimator(ntrees=1,
        ...                                         seed=1234)
        >>> cars_gbm.train(x=predictors,
        ...                y=response,
        ...                training_frame=train,
        ...
validation_frame=valid)
        >>> print(cars_gbm.auc(valid=True))
        >>> print("Number of trees built for cars_gbm model:", cars_gbm.ntrees)
        >>> cars_gbm_continued = H2OGradientBoostingEstimator(checkpoint=cars_gbm.model_id,
        ...                                                   ntrees=50,
        ...                                                   seed=1234)
        >>> cars_gbm_continued.train(x=predictors,
        ...                          y=response,
        ...                          training_frame=train,
        ...                          validation_frame=valid)
        >>> cars_gbm_continued.auc(valid=True)
        >>> print("Number of trees built for cars_gbm_continued model:", cars_gbm_continued.ntrees)
        """
        return self._parms.get("checkpoint")

    @checkpoint.setter
    def checkpoint(self, checkpoint):
        assert_is_type(checkpoint, None, str, H2OEstimator)
        self._parms["checkpoint"] = checkpoint

    @property
    def sample_rate(self):
        """
        Row sample rate per tree (from 0.0 to 1.0).

        Type: ``float``, defaults to ``1.0``.

        :examples:

        >>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
        >>> airlines["Month"] = airlines["Month"].asfactor()
        >>> airlines["Year"] = airlines["Year"].asfactor()
        >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
        >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
        >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
        >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
        ...               "DayOfWeek", "Month", "Distance", "FlightNum"]
        >>> response = "IsDepDelayed"
        >>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
        >>> airlines_gbm = H2OGradientBoostingEstimator(sample_rate=.7,
        ...                                             seed=1234)
        >>> airlines_gbm.train(x=predictors,
        ...                    y=response,
        ...                    training_frame=train,
        ...                    validation_frame=valid)
        >>> airlines_gbm.auc(valid=True)
        """
        return self._parms.get("sample_rate")

    @sample_rate.setter
    def sample_rate(self, sample_rate):
        assert_is_type(sample_rate, None, numeric)
        self._parms["sample_rate"] = sample_rate

    @property
    def sample_rate_per_class(self):
        """
        A list of row sample rates per class (relative fraction for each class, from 0.0 to 1.0), for each tree.

        Type: ``List[float]``.
:examples:

        >>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data")
        >>> covtype[54] = covtype[54].asfactor()
        >>> predictors = covtype.columns[0:54]
        >>> response = 'C55'
        >>> train, valid = covtype.split_frame(ratios=[.8], seed=1234)
        >>> rate_per_class_list = [1, .4, 1, 1, 1, 1, 1]
        >>> cov_gbm = H2OGradientBoostingEstimator(sample_rate_per_class=rate_per_class_list,
        ...                                        seed=1234)
        >>> cov_gbm.train(x=predictors,
        ...               y=response,
        ...               training_frame=train,
        ...               validation_frame=valid)
        >>> cov_gbm.logloss(valid=True)
        """
        return self._parms.get("sample_rate_per_class")

    @sample_rate_per_class.setter
    def sample_rate_per_class(self, sample_rate_per_class):
        assert_is_type(sample_rate_per_class, None, [numeric])
        self._parms["sample_rate_per_class"] = sample_rate_per_class

    @property
    def col_sample_rate(self):
        """
        Column sample rate (from 0.0 to 1.0).

        Type: ``float``, defaults to ``1.0``.

        :examples:

        >>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
        >>> airlines["Year"] = airlines["Year"].asfactor()
        >>> airlines["Month"] = airlines["Month"].asfactor()
        >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
        >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
        >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
        >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
        ...               "DayOfWeek", "Month", "Distance", "FlightNum"]
        >>> response = "IsDepDelayed"
        >>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
        >>> airlines_gbm = H2OGradientBoostingEstimator(col_sample_rate=.7,
        ...                                             seed=1234)
        >>> airlines_gbm.train(x=predictors,
        ...                    y=response,
        ...                    training_frame=train,
        ...
validation_frame=valid)
        >>> airlines_gbm.auc(valid=True)
        """
        return self._parms.get("col_sample_rate")

    @col_sample_rate.setter
    def col_sample_rate(self, col_sample_rate):
        assert_is_type(col_sample_rate, None, numeric)
        self._parms["col_sample_rate"] = col_sample_rate

    @property
    def col_sample_rate_change_per_level(self):
        """
        Relative change of the column sampling rate for every level (must be > 0.0 and <= 2.0).

        Type: ``float``, defaults to ``1.0``.

        :examples:

        >>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
        >>> airlines["Year"] = airlines["Year"].asfactor()
        >>> airlines["Month"] = airlines["Month"].asfactor()
        >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
        >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
        >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
        >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
        ...               "DayOfWeek", "Month", "Distance", "FlightNum"]
        >>> response = "IsDepDelayed"
        >>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
        >>> airlines_gbm = H2OGradientBoostingEstimator(col_sample_rate_change_per_level=.9,
        ...                                             seed=1234)
        >>> airlines_gbm.train(x=predictors,
        ...                    y=response,
        ...                    training_frame=train,
        ...                    validation_frame=valid)
        >>> airlines_gbm.auc(valid=True)
        """
        return self._parms.get("col_sample_rate_change_per_level")

    @col_sample_rate_change_per_level.setter
    def col_sample_rate_change_per_level(self, col_sample_rate_change_per_level):
        assert_is_type(col_sample_rate_change_per_level, None, numeric)
        self._parms["col_sample_rate_change_per_level"] = col_sample_rate_change_per_level

    @property
    def col_sample_rate_per_tree(self):
        """
        Column sample rate per tree (from 0.0 to 1.0).

        Type: ``float``, defaults to ``1.0``.
:examples:

        >>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
        >>> airlines["Year"] = airlines["Year"].asfactor()
        >>> airlines["Month"] = airlines["Month"].asfactor()
        >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
        >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor()
        >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
        >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
        ...               "DayOfWeek", "Month", "Distance", "FlightNum"]
        >>> response = "IsDepDelayed"
        >>> train, valid = airlines.split_frame(ratios=[.8], seed=1234)
        >>> airlines_gbm = H2OGradientBoostingEstimator(col_sample_rate_per_tree=.7,
        ...                                             seed=1234)
        >>> airlines_gbm.train(x=predictors,
        ...                    y=response,
        ...                    training_frame=train,
        ...                    validation_frame=valid)
        >>> airlines_gbm.auc(valid=True)
        """
        return self._parms.get("col_sample_rate_per_tree")

    @col_sample_rate_per_tree.setter
    def col_sample_rate_per_tree(self, col_sample_rate_per_tree):
        assert_is_type(col_sample_rate_per_tree, None, numeric)
        self._parms["col_sample_rate_per_tree"] = col_sample_rate_per_tree

    @property
    def min_split_improvement(self):
        """
        Minimum relative improvement in squared error reduction for a split to happen.

        Type: ``float``, defaults to ``1e-05``.

        :examples:

        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> response = "economy_20mpg"
        >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
        >>> cars_gbm = H2OGradientBoostingEstimator(min_split_improvement=1e-3,
        ...                                         seed=1234)
        >>> cars_gbm.train(x=predictors,
        ...                y=response,
        ...                training_frame=train,
        ...
validation_frame=valid) >>> cars_gbm.auc(valid=True) """ return self._parms.get("min_split_improvement") @min_split_improvement.setter def min_split_improvement(self, min_split_improvement): assert_is_type(min_split_improvement, None, numeric) self._parms["min_split_improvement"] = min_split_improvement @property def histogram_type(self): """ What type of histogram to use for finding optimal split points Type: ``Literal["auto", "uniform_adaptive", "random", "quantiles_global", "round_robin", "uniform_robust"]``, defaults to ``"auto"``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid = airlines.split_frame(ratios=[.8], seed=1234) >>> airlines_gbm = H2OGradientBoostingEstimator(histogram_type="UniformAdaptive", ... seed=1234) >>> airlines_gbm.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> airlines_gbm.auc(valid=True) """ return self._parms.get("histogram_type") @histogram_type.setter def histogram_type(self, histogram_type): assert_is_type(histogram_type, None, Enum("auto", "uniform_adaptive", "random", "quantiles_global", "round_robin", "uniform_robust")) self._parms["histogram_type"] = histogram_type @property def max_abs_leafnode_pred(self): """ Maximum absolute value of a leaf node prediction Type: ``float``, defaults to ``∞``. 
:examples: >>> covtype = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/covtype/covtype.20k.data") >>> covtype[54] = covtype[54].asfactor() >>> predictors = covtype.columns[0:54] >>> response = 'C55' >>> train, valid = covtype.split_frame(ratios=[.8], seed=1234) >>> cov_gbm = H2OGradientBoostingEstimator(max_abs_leafnode_pred=2, ... seed=1234) >>> cov_gbm.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> cov_gbm.logloss(valid=True) """ return self._parms.get("max_abs_leafnode_pred") @max_abs_leafnode_pred.setter def max_abs_leafnode_pred(self, max_abs_leafnode_pred): assert_is_type(max_abs_leafnode_pred, None, numeric) self._parms["max_abs_leafnode_pred"] = max_abs_leafnode_pred @property def pred_noise_bandwidth(self): """ Bandwidth (sigma) of Gaussian multiplicative noise ~N(1,sigma) for tree node predictions Type: ``float``, defaults to ``0.0``. :examples: >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv") >>> titanic['survived'] = titanic['survived'].asfactor() >>> predictors = titanic.columns >>> del predictors[1:3] >>> response = 'survived' >>> train, valid = titanic.split_frame(ratios=[.8], seed=1234) >>> titanic_gbm = H2OGradientBoostingEstimator(pred_noise_bandwidth=0.1, ... seed=1234) >>> titanic_gbm.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> titanic_gbm.auc(valid = True) """ return self._parms.get("pred_noise_bandwidth") @pred_noise_bandwidth.setter def pred_noise_bandwidth(self, pred_noise_bandwidth): assert_is_type(pred_noise_bandwidth, None, numeric) self._parms["pred_noise_bandwidth"] = pred_noise_bandwidth @property def categorical_encoding(self): """ Encoding scheme for categorical features Type: ``Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"]``, defaults to ``"auto"``. 
:examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid = airlines.split_frame(ratios=[.8], seed=1234) >>> airlines_gbm = H2OGradientBoostingEstimator(categorical_encoding="labelencoder", ... seed=1234) >>> airlines_gbm.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> airlines_gbm.auc(valid=True) """ return self._parms.get("categorical_encoding") @categorical_encoding.setter def categorical_encoding(self, categorical_encoding): assert_is_type(categorical_encoding, None, Enum("auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited")) self._parms["categorical_encoding"] = categorical_encoding @property def calibrate_model(self): """ Use Platt Scaling (default) or Isotonic Regression to calculate calibrated class probabilities. Calibration can provide more accurate estimates of class probabilities. Type: ``bool``, defaults to ``False``. :examples: >>> ecology = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/ecology_model.csv") >>> ecology['Angaus'] = ecology['Angaus'].asfactor() >>> response = 'Angaus' >>> train, calib = ecology.split_frame(seed = 12354) >>> predictors = ecology.columns[3:13] >>> w = h2o.create_frame(binary_fraction=1, ... binary_ones_fraction=0.5, ... missing_fraction=0, ... 
rows=744, cols=1) >>> w.set_names(["weight"]) >>> train = train.cbind(w) >>> ecology_gbm = H2OGradientBoostingEstimator(ntrees=10, ... max_depth=5, ... min_rows=10, ... learn_rate=0.1, ... distribution="multinomial", ... weights_column="weight", ... calibrate_model=True, ... calibration_frame=calib) >>> ecology_gbm.train(x=predictors, ... y="Angaus", ... training_frame=train) >>> ecology_gbm.auc() """ return self._parms.get("calibrate_model") @calibrate_model.setter def calibrate_model(self, calibrate_model): assert_is_type(calibrate_model, None, bool) self._parms["calibrate_model"] = calibrate_model @property def calibration_frame(self): """ Data for model calibration Type: ``Union[None, str, H2OFrame]``. :examples: >>> ecology = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/ecology_model.csv") >>> ecology['Angaus'] = ecology['Angaus'].asfactor() >>> response = 'Angaus' >>> predictors = ecology.columns[3:13] >>> train, calib = ecology.split_frame(seed=12354) >>> w = h2o.create_frame(binary_fraction=1, ... binary_ones_fraction=0.5, ... missing_fraction=0, ... rows=744,cols=1) >>> w.set_names(["weight"]) >>> train = train.cbind(w) >>> ecology_gbm = H2OGradientBoostingEstimator(ntrees=10, ... max_depth=5, ... min_rows=10, ... learn_rate=0.1, ... distribution="multinomial", ... calibrate_model=True, ... calibration_frame=calib) >>> ecology_gbm.train(x=predictors, ... y="Angaus", ... training_frame=train, ... weights_column="weight") >>> ecology_gbm.auc() """ return self._parms.get("calibration_frame") @calibration_frame.setter def calibration_frame(self, calibration_frame): self._parms["calibration_frame"] = H2OFrame._validate(calibration_frame, 'calibration_frame') @property def calibration_method(self): """ Calibration method to use Type: ``Literal["auto", "platt_scaling", "isotonic_regression"]``, defaults to ``"auto"``. 
""" return self._parms.get("calibration_method") @calibration_method.setter def calibration_method(self, calibration_method): assert_is_type(calibration_method, None, Enum("auto", "platt_scaling", "isotonic_regression")) self._parms["calibration_method"] = calibration_method @property def custom_metric_func(self): """ Reference to custom evaluation function, format: `language:keyName=funcName` Type: ``str``. """ return self._parms.get("custom_metric_func") @custom_metric_func.setter def custom_metric_func(self, custom_metric_func): assert_is_type(custom_metric_func, None, str) self._parms["custom_metric_func"] = custom_metric_func @property def custom_distribution_func(self): """ Reference to custom distribution, format: `language:keyName=funcName` Type: ``str``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") >>> airlines["Year"] = airlines["Year"].asfactor() >>> airlines["Month"] = airlines["Month"].asfactor() >>> airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor() >>> airlines["Cancelled"] = airlines["Cancelled"].asfactor() >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor() >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier", ... "DayOfWeek", "Month", "Distance", "FlightNum"] >>> response = "IsDepDelayed" >>> train, valid = airlines.split_frame(ratios=[.8], seed=1234) >>> airlines_gbm = H2OGradientBoostingEstimator(ntrees=3, ... max_depth=5, ... distribution="bernoulli", ... seed=1234) >>> airlines_gbm.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame valid) >>> from h2o.utils.distributions import CustomDistributionBernoulli >>> custom_distribution_bernoulli = h2o.upload_custom_distribution(CustomDistributionBernoulli, ... func_name="custom_bernoulli", ... func_file="custom_bernoulli.py") >>> airlines_gbm_custom = H2OGradientBoostingEstimator(ntrees=3, ... max_depth=5, ... distribution="custom", ... 
custom_distribution_func=custom_distribution_bernoulli, ... seed=1235) >>> airlines_gbm_custom.train(x=predictors, ... y=response, ... training_frame=train, ... validation_frame=valid) >>> airlines_gbm.auc() """ return self._parms.get("custom_distribution_func") @custom_distribution_func.setter def custom_distribution_func(self, custom_distribution_func): assert_is_type(custom_distribution_func, None, str) self._parms["custom_distribution_func"] = custom_distribution_func @property def export_checkpoints_dir(self): """ Automatically export generated models to this directory. Type: ``str``. :examples: >>> airlines = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip", destination_frame="air.hex") >>> predictors = ["DayofMonth", "DayOfWeek"] >>> response = "IsDepDelayed" >>> hyper_parameters = {'ntrees': [5,10]} >>> search_crit = {'strategy': "RandomDiscrete", ... 'max_models': 5, ... 'seed': 1234, ... 'stopping_rounds': 3, ... 'stopping_metric': "AUTO", ... 'stopping_tolerance': 1e-2} >>> checkpoints_dir = tempfile.mkdtemp() >>> air_grid = H2OGridSearch(H2OGradientBoostingEstimator, ... hyper_params=hyper_parameters, ... search_criteria=search_crit) >>> air_grid.train(x=predictors, ... y=response, ... training_frame=airlines, ... distribution="bernoulli", ... learn_rate=0.1, ... max_depth=3, ... export_checkpoints_dir=checkpoints_dir) >>> len(listdir(checkpoints_dir)) """ return self._parms.get("export_checkpoints_dir") @export_checkpoints_dir.setter def export_checkpoints_dir(self, export_checkpoints_dir): assert_is_type(export_checkpoints_dir, None, str) self._parms["export_checkpoints_dir"] = export_checkpoints_dir @property def in_training_checkpoints_dir(self): """ Create checkpoints into defined directory while training process is still running. In case of cluster shutdown, this checkpoint can be used to restart training. Type: ``str``. 
""" return self._parms.get("in_training_checkpoints_dir") @in_training_checkpoints_dir.setter def in_training_checkpoints_dir(self, in_training_checkpoints_dir): assert_is_type(in_training_checkpoints_dir, None, str) self._parms["in_training_checkpoints_dir"] = in_training_checkpoints_dir @property def in_training_checkpoints_tree_interval(self): """ Checkpoint the model after every so many trees. Parameter is used only when in_training_checkpoints_dir is defined Type: ``int``, defaults to ``1``. """ return self._parms.get("in_training_checkpoints_tree_interval") @in_training_checkpoints_tree_interval.setter def in_training_checkpoints_tree_interval(self, in_training_checkpoints_tree_interval): assert_is_type(in_training_checkpoints_tree_interval, None, int) self._parms["in_training_checkpoints_tree_interval"] = in_training_checkpoints_tree_interval @property def monotone_constraints(self): """ A mapping representing monotonic constraints. Use +1 to enforce an increasing constraint and -1 to specify a decreasing constraint. Type: ``dict``. :examples: >>> prostate_hex = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip") >>> prostate_hex["CAPSULE"] = prostate_hex["CAPSULE"].asfactor() >>> response = "CAPSULE" >>> seed = 42 >>> monotone_constraints = {"AGE":1} >>> gbm_model = H2OGradientBoostingEstimator(seed=seed, ... monotone_constraints=monotone_constraints) >>> gbm_model.train(y=response, ... ignored_columns=["ID"], ... training_frame=prostate_hex) >>> gbm_model.scoring_history() """ return self._parms.get("monotone_constraints") @monotone_constraints.setter def monotone_constraints(self, monotone_constraints): assert_is_type(monotone_constraints, None, dict) self._parms["monotone_constraints"] = monotone_constraints @property def check_constant_response(self): """ Check if response column is constant. 
If enabled, then an exception is thrown if the response column is a constant value.If disabled, then model will train regardless of the response column being a constant value or not. Type: ``bool``, defaults to ``True``. :examples: >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv") >>> train["constantCol"] = 1 >>> my_gbm = H2OGradientBoostingEstimator(check_constant_response=False) >>> my_gbm.train(x=list(range(1,5)), ... y="constantCol", ... training_frame=train) """ return self._parms.get("check_constant_response") @check_constant_response.setter def check_constant_response(self, check_constant_response): assert_is_type(check_constant_response, None, bool) self._parms["check_constant_response"] = check_constant_response @property def gainslift_bins(self): """ Gains/Lift table number of bins. 0 means disabled.. Default value -1 means automatic binning. Type: ``int``, defaults to ``-1``. :examples: >>> airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/testng/airlines_train.csv") >>> model = H2OGradientBoostingEstimator(ntrees=1, gainslift_bins=20) >>> model.train(x=["Origin", "Distance"], ... y="IsDepDelayed", ... training_frame=airlines) >>> model.gains_lift() """ return self._parms.get("gainslift_bins") @gainslift_bins.setter def gainslift_bins(self, gainslift_bins): assert_is_type(gainslift_bins, None, int) self._parms["gainslift_bins"] = gainslift_bins @property def auc_type(self): """ Set default multinomial AUC type. Type: ``Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]``, defaults to ``"auto"``. """ return self._parms.get("auc_type") @auc_type.setter def auc_type(self, auc_type): assert_is_type(auc_type, None, Enum("auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo")) self._parms["auc_type"] = auc_type @property def interaction_constraints(self): """ A set of allowed column interactions. Type: ``List[List[str]]``. 
""" return self._parms.get("interaction_constraints") @interaction_constraints.setter def interaction_constraints(self, interaction_constraints): assert_is_type(interaction_constraints, None, [[str]]) self._parms["interaction_constraints"] = interaction_constraints @property def auto_rebalance(self): """ Allow automatic rebalancing of training and validation datasets Type: ``bool``, defaults to ``True``. """ return self._parms.get("auto_rebalance") @auto_rebalance.setter def auto_rebalance(self, auto_rebalance): assert_is_type(auto_rebalance, None, bool) self._parms["auto_rebalance"] = auto_rebalance