# Source code for h2o.estimators.extended_isolation_forest

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai;  Apache License Version 2.0 (see LICENSE for details)
#

from h2o.estimators.estimator_base import H2OEstimator
from h2o.exceptions import H2OValueError
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type, Enum, numeric


class H2OExtendedIsolationForestEstimator(H2OEstimator):
    """
    Extended Isolation Forest

    Builds an Extended Isolation Forest model. Extended Isolation Forest generalizes its predecessor algorithm,
    Isolation Forest. The original Isolation Forest algorithm suffers from bias due to tree branching. The extension
    of the algorithm mitigates the bias by adjusting the branching, and the original algorithm becomes just a special
    case. Extended Isolation Forest's attribute "extension_level" allows leveraging the generalization. The minimum
    value is 0 and corresponds to the behavior of Isolation Forest. The maximum value is (numCols - 1) and stands for
    full extension. The rest of the algorithm is analogous to the Isolation Forest algorithm. Each iteration builds a
    tree that partitions the sample observations' space until it isolates an observation. The length of the path from
    the root to a leaf node of the resulting tree is used to calculate the anomaly score. Anomalies are easier to
    isolate, and their average tree path is expected to be shorter than the paths of regular observations. The anomaly
    score is a number between 0 and 1. A number closer to 0 is a normal point, and a number closer to 1 is a more
    anomalous point.
    """

    algo = "extendedisolationforest"
    supervised_learning = False

    def __init__(self,
                 model_id=None,  # type: Optional[Union[None, str, H2OEstimator]]
                 training_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
                 ignored_columns=None,  # type: Optional[List[str]]
                 ignore_const_cols=True,  # type: bool
                 categorical_encoding="auto",  # type: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"]
                 score_each_iteration=False,  # type: bool
                 score_tree_interval=0,  # type: int
                 ntrees=100,  # type: int
                 sample_size=256,  # type: int
                 extension_level=0,  # type: int
                 seed=-1,  # type: int
                 disable_training_metrics=True,  # type: bool
                 ):
        """
        :param model_id: Destination id for this model; auto-generated if not specified.
               Defaults to ``None``.
        :type model_id: Union[None, str, H2OEstimator], optional
        :param training_frame: Id of the training data frame.
               Defaults to ``None``.
        :type training_frame: Union[None, str, H2OFrame], optional
        :param ignored_columns: Names of columns to ignore for training.
               Defaults to ``None``.
        :type ignored_columns: List[str], optional
        :param ignore_const_cols: Ignore constant columns.
               Defaults to ``True``.
        :type ignore_const_cols: bool
        :param categorical_encoding: Encoding scheme for categorical features.
               Defaults to ``"auto"``.
        :type categorical_encoding: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder",
               "sort_by_response", "enum_limited"]
        :param score_each_iteration: Whether to score during each iteration of model training.
               Defaults to ``False``.
        :type score_each_iteration: bool
        :param score_tree_interval: Score the model after every so many trees. Disabled if set to 0.
               Defaults to ``0``.
        :type score_tree_interval: int
        :param ntrees: Number of Extended Isolation Forest trees.
               Defaults to ``100``.
        :type ntrees: int
        :param sample_size: Number of randomly sampled observations used to train each Extended Isolation Forest tree.
               Defaults to ``256``.
        :type sample_size: int
        :param extension_level: Maximum is N - 1 (N = numCols). Minimum is 0. Extended Isolation Forest with
               extension_level = 0 behaves like Isolation Forest.
               Defaults to ``0``.
        :type extension_level: int
        :param seed: Seed for pseudo random number generator (if applicable).
               Defaults to ``-1``.
        :type seed: int
        :param disable_training_metrics: Disable calculating training metrics (expensive on large datasets).
               Defaults to ``True``.
        :type disable_training_metrics: bool
        """
        super(H2OExtendedIsolationForestEstimator, self).__init__()
        self._parms = {}
        # model_id has no property in this class, so it is written to _parms (and _id) directly.
        self._id = self._parms['model_id'] = model_id
        # Every other argument is assigned through its property so that the setter's
        # validation runs before the value lands in _parms.
        self.training_frame = training_frame
        self.ignored_columns = ignored_columns
        self.ignore_const_cols = ignore_const_cols
        self.categorical_encoding = categorical_encoding
        self.score_each_iteration = score_each_iteration
        self.score_tree_interval = score_tree_interval
        self.ntrees = ntrees
        self.sample_size = sample_size
        self.extension_level = extension_level
        self.seed = seed
        self.disable_training_metrics = disable_training_metrics

    @property
    def training_frame(self):
        """
        Id of the training data frame.

        Type: ``Union[None, str, H2OFrame]``.

        :examples:

        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> cars_eif = H2OExtendedIsolationForestEstimator(seed = 1234,
        ...                                                sample_size = 256,
        ...                                                extension_level = cars.dim[1] - 1)
        >>> cars_eif.train(x = predictors,
        ...                training_frame = cars)
        >>> print(cars_eif)
        """
        return self._parms.get("training_frame")

    @training_frame.setter
    def training_frame(self, training_frame):
        # The stored value is whatever H2OFrame._validate returns for the given input.
        self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame')

    @property
    def ignored_columns(self):
        """
        Names of columns to ignore for training.

        Type: ``List[str]``.
        """
        return self._parms.get("ignored_columns")

    @ignored_columns.setter
    def ignored_columns(self, ignored_columns):
        assert_is_type(ignored_columns, None, [str])
        self._parms["ignored_columns"] = ignored_columns

    @property
    def ignore_const_cols(self):
        """
        Ignore constant columns.

        Type: ``bool``, defaults to ``True``.

        :examples:

        >>> cars = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
        >>> predictors = ["displacement","power","weight","acceleration","year","const_1","const_2"]
        >>> cars["const_1"] = 6
        >>> cars["const_2"] = 7
        >>> cars_eif = H2OExtendedIsolationForestEstimator(seed = 1234,
        ...                                                ignore_const_cols = True)
        >>> cars_eif.train(x = predictors,
        ...                training_frame = cars)
        >>> cars_eif.model_performance()
        """
        return self._parms.get("ignore_const_cols")

    @ignore_const_cols.setter
    def ignore_const_cols(self, ignore_const_cols):
        assert_is_type(ignore_const_cols, None, bool)
        self._parms["ignore_const_cols"] = ignore_const_cols

    @property
    def categorical_encoding(self):
        """
        Encoding scheme for categorical features.

        Type: ``Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder",
        "sort_by_response", "enum_limited"]``, defaults to ``"auto"``.

        :examples:

        >>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
        >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
        ...               "DayOfWeek", "Month", "Distance", "FlightNum"]
        >>> encoding = "one_hot_explicit"
        >>> airlines_eif = H2OExtendedIsolationForestEstimator(categorical_encoding = encoding,
        ...                                                    seed = 1234)
        >>> airlines_eif.train(x = predictors,
        ...                    training_frame = airlines)
        >>> airlines_eif.model_performance()
        """
        return self._parms.get("categorical_encoding")

    @categorical_encoding.setter
    def categorical_encoding(self, categorical_encoding):
        assert_is_type(categorical_encoding, None, Enum("auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"))
        self._parms["categorical_encoding"] = categorical_encoding

    @property
    def score_each_iteration(self):
        """
        Whether to score during each iteration of model training.

        Type: ``bool``, defaults to ``False``.
        """
        return self._parms.get("score_each_iteration")

    @score_each_iteration.setter
    def score_each_iteration(self, score_each_iteration):
        assert_is_type(score_each_iteration, None, bool)
        self._parms["score_each_iteration"] = score_each_iteration

    @property
    def score_tree_interval(self):
        """
        Score the model after every so many trees. Disabled if set to 0.

        Type: ``int``, defaults to ``0``.
        """
        return self._parms.get("score_tree_interval")

    @score_tree_interval.setter
    def score_tree_interval(self, score_tree_interval):
        assert_is_type(score_tree_interval, None, int)
        self._parms["score_tree_interval"] = score_tree_interval

    @property
    def ntrees(self):
        """
        Number of Extended Isolation Forest trees.

        Type: ``int``, defaults to ``100``.

        :examples:

        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> predictors = titanic.columns
        >>> tree_num = [20, 50, 80, 110, 140, 170, 200]
        >>> for num in tree_num:
        ...     titanic_eif = H2OExtendedIsolationForestEstimator(ntrees = num,
        ...                                                       seed = 1234,
        ...                                                       extension_level = titanic.dim[1] - 1)
        ...     titanic_eif.train(x = predictors,
        ...                       training_frame = titanic)
        """
        return self._parms.get("ntrees")

    @ntrees.setter
    def ntrees(self, ntrees):
        assert_is_type(ntrees, None, int)
        self._parms["ntrees"] = ntrees

    @property
    def sample_size(self):
        """
        Number of randomly sampled observations used to train each Extended Isolation Forest tree.

        Type: ``int``, defaults to ``256``.

        :examples:

        >>> train = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/anomaly/ecg_discord_train.csv")
        >>> eif_model = H2OExtendedIsolationForestEstimator(sample_size = 5,
        ...                                                 ntrees=7)
        >>> eif_model.train(training_frame = train)
        >>> print(eif_model)
        """
        return self._parms.get("sample_size")

    @sample_size.setter
    def sample_size(self, sample_size):
        assert_is_type(sample_size, None, int)
        self._parms["sample_size"] = sample_size

    @property
    def extension_level(self):
        """
        Maximum is N - 1 (N = numCols). Minimum is 0. Extended Isolation Forest with extension_level = 0 behaves like
        Isolation Forest.

        Type: ``int``, defaults to ``0``.

        :examples:

        >>> train = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/anomaly/single_blob.csv")
        >>> eif_model = H2OExtendedIsolationForestEstimator(extension_level = 1,
        ...                                                 ntrees=7)
        >>> eif_model.train(training_frame = train)
        >>> print(eif_model)
        """
        return self._parms.get("extension_level")

    @extension_level.setter
    def extension_level(self, extension_level):
        assert_is_type(extension_level, None, int)
        self._parms["extension_level"] = extension_level

    @property
    def seed(self):
        """
        Seed for pseudo random number generator (if applicable).

        Type: ``int``, defaults to ``-1``.

        :examples:

        >>> airlines = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
        >>> predictors = ["Origin", "Dest", "Year", "UniqueCarrier",
        ...               "DayOfWeek", "Month", "Distance", "FlightNum"]
        >>> eif_w_seed = H2OExtendedIsolationForestEstimator(seed = 1234)
        >>> eif_w_seed.train(x = predictors,
        ...                  training_frame = airlines)
        >>> eif_wo_seed = H2OExtendedIsolationForestEstimator()
        >>> eif_wo_seed.train(x = predictors,
        ...                   training_frame = airlines)
        >>> print(eif_w_seed)
        >>> print(eif_wo_seed)
        """
        return self._parms.get("seed")

    @seed.setter
    def seed(self, seed):
        assert_is_type(seed, None, int)
        self._parms["seed"] = seed

    @property
    def disable_training_metrics(self):
        """
        Disable calculating training metrics (expensive on large datasets).

        Type: ``bool``, defaults to ``True``.
        """
        return self._parms.get("disable_training_metrics")

    @disable_training_metrics.setter
    def disable_training_metrics(self, disable_training_metrics):
        assert_is_type(disable_training_metrics, None, bool)
        self._parms["disable_training_metrics"] = disable_training_metrics