Source code for h2o.estimators.rulefit

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai;  Apache License Version 2.0 (see LICENSE for details)
#

from h2o.utils.metaclass import deprecated_params, deprecated_property
import h2o
from h2o.estimators.estimator_base import H2OEstimator
from h2o.exceptions import H2OValueError
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type, Enum, numeric


class H2ORuleFitEstimator(H2OEstimator):
    """
    RuleFit

    Builds a RuleFit on a parsed dataset, for regression or classification.
    """

    algo = "rulefit"
    supervised_learning = True

    @deprecated_params({'Lambda': 'lambda_'})
    def __init__(self,
                 model_id=None,  # type: Optional[Union[None, str, H2OEstimator]]
                 training_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
                 validation_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
                 seed=-1,  # type: int
                 response_column=None,  # type: Optional[str]
                 ignored_columns=None,  # type: Optional[List[str]]
                 algorithm="auto",  # type: Literal["auto", "drf", "gbm"]
                 min_rule_length=3,  # type: int
                 max_rule_length=3,  # type: int
                 max_num_rules=-1,  # type: int
                 model_type="rules_and_linear",  # type: Literal["rules_and_linear", "rules", "linear"]
                 weights_column=None,  # type: Optional[str]
                 distribution="auto",  # type: Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber"]
                 rule_generation_ntrees=50,  # type: int
                 auc_type="auto",  # type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]
                 remove_duplicates=True,  # type: bool
                 lambda_=None,  # type: Optional[List[float]]
                 max_categorical_levels=10,  # type: int
                 ):
        """
        :param model_id: Destination id for this model; auto-generated if not specified.
               Defaults to ``None``.
        :type model_id: Union[None, str, H2OEstimator], optional

        :param training_frame: Id of the training data frame.
               Defaults to ``None``.
        :type training_frame: Union[None, str, H2OFrame], optional

        :param validation_frame: Id of the validation data frame.
               Defaults to ``None``.
        :type validation_frame: Union[None, str, H2OFrame], optional

        :param seed: Seed for pseudo random number generator (if applicable).
               Defaults to ``-1``.
        :type seed: int

        :param response_column: Response variable column.
               Defaults to ``None``.
        :type response_column: str, optional

        :param ignored_columns: Names of columns to ignore for training.
               Defaults to ``None``.
        :type ignored_columns: List[str], optional

        :param algorithm: The algorithm to use to generate rules.
               Defaults to ``"auto"``.
        :type algorithm: Literal["auto", "drf", "gbm"]

        :param min_rule_length: Minimum length of rules.
               Defaults to ``3``.
        :type min_rule_length: int

        :param max_rule_length: Maximum length of rules.
               Defaults to ``3``.
        :type max_rule_length: int

        :param max_num_rules: The maximum number of rules to return; -1 means the number of rules is selected by
               diminishing returns in model deviance.
               Defaults to ``-1``.
        :type max_num_rules: int

        :param model_type: Specifies type of base learners in the ensemble.
               Defaults to ``"rules_and_linear"``.
        :type model_type: Literal["rules_and_linear", "rules", "linear"]

        :param weights_column: Column with observation weights. Giving some observation a weight of zero is equivalent
               to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to
               repeating that row twice. Negative weights are not allowed. Note: Weights are per-row observation
               weights and do not increase the size of the data frame. This is typically the number of times a row is
               repeated, but non-integer values are supported as well. During training, rows with higher weights
               matter more, due to the larger loss function pre-factor. If you set weight = 0 for a row, the returned
               prediction frame at that row is zero and this is incorrect. To get an accurate prediction, remove all
               rows with weight == 0.
               Defaults to ``None``.
        :type weights_column: str, optional

        :param distribution: Distribution function.
               Defaults to ``"auto"``.
        :type distribution: Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie",
               "laplace", "quantile", "huber"]

        :param rule_generation_ntrees: Specifies the number of trees to build in the tree model.
               Defaults to ``50``.
        :type rule_generation_ntrees: int

        :param auc_type: Set default multinomial AUC type.
               Defaults to ``"auto"``.
        :type auc_type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]

        :param remove_duplicates: Whether to remove rules which are identical to an earlier rule.
               Defaults to ``True``.
        :type remove_duplicates: bool

        :param lambda_: Lambda for LASSO regressor.
               Defaults to ``None``.
        :type lambda_: List[float], optional

        :param max_categorical_levels: For every categorical feature, only use this many most frequent categorical
               levels for model training. Only used for categorical_encoding == EnumLimited.
               Defaults to ``10``.
        :type max_categorical_levels: int
        """
        super(H2ORuleFitEstimator, self).__init__()
        self._parms = {}
        self._id = self._parms['model_id'] = model_id
        self.training_frame = training_frame
        self.validation_frame = validation_frame
        self.seed = seed
        self.response_column = response_column
        self.ignored_columns = ignored_columns
        self.algorithm = algorithm
        self.min_rule_length = min_rule_length
        self.max_rule_length = max_rule_length
        self.max_num_rules = max_num_rules
        self.model_type = model_type
        self.weights_column = weights_column
        self.distribution = distribution
        self.rule_generation_ntrees = rule_generation_ntrees
        self.auc_type = auc_type
        self.remove_duplicates = remove_duplicates
        self.lambda_ = lambda_
        self.max_categorical_levels = max_categorical_levels

    @property
    def training_frame(self):
        """
        Id of the training data frame.

        Type: ``Union[None, str, H2OFrame]``.
        """
        return self._parms.get("training_frame")

    @training_frame.setter
    def training_frame(self, training_frame):
        self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame')

    @property
    def validation_frame(self):
        """
        Id of the validation data frame.

        Type: ``Union[None, str, H2OFrame]``.
        """
        return self._parms.get("validation_frame")

    @validation_frame.setter
    def validation_frame(self, validation_frame):
        self._parms["validation_frame"] = H2OFrame._validate(validation_frame, 'validation_frame')

    @property
    def seed(self):
        """
        Seed for pseudo random number generator (if applicable).

        Type: ``int``, defaults to ``-1``.
        """
        return self._parms.get("seed")

    @seed.setter
    def seed(self, seed):
        assert_is_type(seed, None, int)
        self._parms["seed"] = seed

    @property
    def response_column(self):
        """
        Response variable column.

        Type: ``str``.
        """
        return self._parms.get("response_column")

    @response_column.setter
    def response_column(self, response_column):
        assert_is_type(response_column, None, str)
        self._parms["response_column"] = response_column

    @property
    def ignored_columns(self):
        """
        Names of columns to ignore for training.

        Type: ``List[str]``.
        """
        return self._parms.get("ignored_columns")

    @ignored_columns.setter
    def ignored_columns(self, ignored_columns):
        assert_is_type(ignored_columns, None, [str])
        self._parms["ignored_columns"] = ignored_columns

    @property
    def algorithm(self):
        """
        The algorithm to use to generate rules.

        Type: ``Literal["auto", "drf", "gbm"]``, defaults to ``"auto"``.
        """
        return self._parms.get("algorithm")

    @algorithm.setter
    def algorithm(self, algorithm):
        assert_is_type(algorithm, None, Enum("auto", "drf", "gbm"))
        self._parms["algorithm"] = algorithm

    @property
    def min_rule_length(self):
        """
        Minimum length of rules.

        Type: ``int``, defaults to ``3``.
        """
        return self._parms.get("min_rule_length")

    @min_rule_length.setter
    def min_rule_length(self, min_rule_length):
        assert_is_type(min_rule_length, None, int)
        self._parms["min_rule_length"] = min_rule_length

    @property
    def max_rule_length(self):
        """
        Maximum length of rules.

        Type: ``int``, defaults to ``3``.
        """
        return self._parms.get("max_rule_length")

    @max_rule_length.setter
    def max_rule_length(self, max_rule_length):
        assert_is_type(max_rule_length, None, int)
        self._parms["max_rule_length"] = max_rule_length

    @property
    def max_num_rules(self):
        """
        The maximum number of rules to return; -1 means the number of rules is selected by diminishing returns in
        model deviance.

        Type: ``int``, defaults to ``-1``.
        """
        return self._parms.get("max_num_rules")

    @max_num_rules.setter
    def max_num_rules(self, max_num_rules):
        assert_is_type(max_num_rules, None, int)
        self._parms["max_num_rules"] = max_num_rules

    @property
    def model_type(self):
        """
        Specifies type of base learners in the ensemble.

        Type: ``Literal["rules_and_linear", "rules", "linear"]``, defaults to ``"rules_and_linear"``.
        """
        return self._parms.get("model_type")

    @model_type.setter
    def model_type(self, model_type):
        assert_is_type(model_type, None, Enum("rules_and_linear", "rules", "linear"))
        self._parms["model_type"] = model_type

    @property
    def weights_column(self):
        """
        Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from
        the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative
        weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the
        data frame. This is typically the number of times a row is repeated, but non-integer values are supported as
        well. During training, rows with higher weights matter more, due to the larger loss function pre-factor. If
        you set weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get
        an accurate prediction, remove all rows with weight == 0.

        Type: ``str``.
        """
        return self._parms.get("weights_column")

    @weights_column.setter
    def weights_column(self, weights_column):
        assert_is_type(weights_column, None, str)
        self._parms["weights_column"] = weights_column

    @property
    def distribution(self):
        """
        Distribution function.

        Type: ``Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace",
        "quantile", "huber"]``, defaults to ``"auto"``.
        """
        return self._parms.get("distribution")

    @distribution.setter
    def distribution(self, distribution):
        assert_is_type(distribution, None, Enum("auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber"))
        self._parms["distribution"] = distribution

    @property
    def rule_generation_ntrees(self):
        """
        Specifies the number of trees to build in the tree model.

        Type: ``int``, defaults to ``50``.
        """
        return self._parms.get("rule_generation_ntrees")

    @rule_generation_ntrees.setter
    def rule_generation_ntrees(self, rule_generation_ntrees):
        assert_is_type(rule_generation_ntrees, None, int)
        self._parms["rule_generation_ntrees"] = rule_generation_ntrees

    @property
    def auc_type(self):
        """
        Set default multinomial AUC type.

        Type: ``Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]``, defaults to
        ``"auto"``.
        """
        return self._parms.get("auc_type")

    @auc_type.setter
    def auc_type(self, auc_type):
        assert_is_type(auc_type, None, Enum("auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"))
        self._parms["auc_type"] = auc_type

    @property
    def remove_duplicates(self):
        """
        Whether to remove rules which are identical to an earlier rule.

        Type: ``bool``, defaults to ``True``.
        """
        return self._parms.get("remove_duplicates")

    @remove_duplicates.setter
    def remove_duplicates(self, remove_duplicates):
        assert_is_type(remove_duplicates, None, bool)
        self._parms["remove_duplicates"] = remove_duplicates

    @property
    def lambda_(self):
        """
        Lambda for LASSO regressor.

        Type: ``List[float]``.
        """
        return self._parms.get("lambda")

    @lambda_.setter
    def lambda_(self, lambda_):
        assert_is_type(lambda_, None, numeric, [numeric])
        self._parms["lambda"] = lambda_

    @property
    def max_categorical_levels(self):
        """
        For every categorical feature, only use this many most frequent categorical levels for model training. Only
        used for categorical_encoding == EnumLimited.

        Type: ``int``, defaults to ``10``.
        """
        return self._parms.get("max_categorical_levels")

    @max_categorical_levels.setter
    def max_categorical_levels(self, max_categorical_levels):
        assert_is_type(max_categorical_levels, None, int)
        self._parms["max_categorical_levels"] = max_categorical_levels

    Lambda = deprecated_property('Lambda', lambda_)
    def rule_importance(self):
        """
        Retrieve rule importances for a RuleFit model.

        :return: H2OTwoDimTable
        """
        if self._model_json["algo"] != "rulefit":
            raise H2OValueError("This function is available for RuleFit models only")
        kwargs = {}
        kwargs["model_id"] = self.model_id
        json = h2o.api("POST /3/SignificantRules", data=kwargs)
        return json['significant_rules_table']
    def predict_rules(self, frame, rule_ids):
        """
        Evaluates validity of the given rules on the given data.

        :param frame: H2OFrame on which rule validity is to be evaluated.
        :param rule_ids: string array of rule ids to be evaluated against the frame.
        :return: H2OFrame with one column per input rule id, flagging whether that rule applies to each observation.
        """
        from h2o.frame import H2OFrame
        from h2o.utils.typechecks import assert_is_type
        from h2o.expr import ExprNode
        assert_is_type(frame, H2OFrame)
        return H2OFrame._expr(expr=ExprNode("rulefit.predict.rules", self, frame, rule_ids))
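

# ---------------------------------------------------------------------------
# Usage sketch: a minimal, hedged example of training a RuleFit model and
# inspecting its rules.  It assumes a reachable H2O cluster (h2o.init() starts
# a local one when possible) and uses the public prostate demo dataset URL and
# its "CAPSULE" response column purely for illustration; substitute your own
# frame, response column, and parameter values.
if __name__ == "__main__":
    h2o.init()

    # Illustrative dataset; any parsed H2OFrame with a categorical response works.
    prostate = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()

    # Parameter values here are illustrative, not recommended settings.
    rulefit = H2ORuleFitEstimator(max_rule_length=5,
                                  max_num_rules=50,
                                  seed=1234)
    rulefit.train(y="CAPSULE", training_frame=prostate)

    # Significant rules come back as an H2OTwoDimTable; rule ids taken from this
    # table can then be passed to predict_rules() to flag, per row, whether each
    # rule applies.
    print(rulefit.rule_importance())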