Source code for h2o.estimators.infogram

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai;  Apache License Version 2.0 (see LICENSE for details)
#

import ast
import json
import warnings
import h2o
from h2o.utils.shared_utils import can_use_numpy
from h2o.utils.typechecks import is_type
from h2o.plot import get_matplotlib_pyplot, decorate_plot_result, get_polycollection
from h2o.estimators.estimator_base import H2OEstimator
from h2o.exceptions import H2OValueError
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type, Enum, numeric


class H2OInfogram(H2OEstimator):
    """
    Information Diagram

    The infogram is a graphical information-theoretic interpretability tool which allows the user to quickly spot
    the core, decision-making variables that uniquely and safely drive the response, in supervised classification
    problems. The infogram can significantly cut down the number of predictors needed to build a model by
    identifying only the most valuable, admissible features. When protected variables such as race or gender are
    present in the data, the admissibility of a variable is determined by a safety and relevancy index, and thus
    serves as a diagnostic tool for fairness. The safety of each feature can be quantified, and variables that are
    unsafe will be considered inadmissible. Models built using only admissible features will naturally be more
    interpretable, given the reduced feature set. Admissible models are also less susceptible to overfitting and
    train faster, while providing similar accuracy as models built using all available features.
    """

    algo = "infogram"
    supervised_learning = True

    def __init__(self,
                 model_id=None,  # type: Optional[Union[None, str, H2OEstimator]]
                 training_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
                 validation_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
                 seed=-1,  # type: int
                 keep_cross_validation_models=True,  # type: bool
                 keep_cross_validation_predictions=False,  # type: bool
                 keep_cross_validation_fold_assignment=False,  # type: bool
                 nfolds=0,  # type: int
                 fold_assignment="auto",  # type: Literal["auto", "random", "modulo", "stratified"]
                 fold_column=None,  # type: Optional[str]
                 response_column=None,  # type: Optional[str]
                 ignored_columns=None,  # type: Optional[List[str]]
                 ignore_const_cols=True,  # type: bool
                 score_each_iteration=False,  # type: bool
                 offset_column=None,  # type: Optional[str]
                 weights_column=None,  # type: Optional[str]
                 standardize=False,  # type: bool
                 distribution="auto",  # type: Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber"]
                 plug_values=None,  # type: Optional[Union[None, str, H2OFrame]]
                 max_iterations=0,  # type: int
                 stopping_rounds=0,  # type: int
                 stopping_metric="auto",  # type: Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"]
                 stopping_tolerance=0.001,  # type: float
                 balance_classes=False,  # type: bool
                 class_sampling_factors=None,  # type: Optional[List[float]]
                 max_after_balance_size=5.0,  # type: float
                 max_runtime_secs=0.0,  # type: float
                 custom_metric_func=None,  # type: Optional[str]
                 auc_type="auto",  # type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]
                 algorithm="auto",  # type: Literal["auto", "deeplearning", "drf", "gbm", "glm", "xgboost"]
                 algorithm_params=None,  # type: Optional[dict]
                 protected_columns=None,  # type: Optional[List[str]]
                 total_information_threshold=-1.0,  # type: float
                 net_information_threshold=-1.0,  # type: float
                 relevance_index_threshold=-1.0,  # type: float
                 safety_index_threshold=-1.0,  # type: float
                 data_fraction=1.0,  # type: float
                 top_n_features=50,  # type: int
                 ):
        """
        :param model_id: Destination id for this model; auto-generated if not specified. Defaults to ``None``.
        :type model_id: Union[None, str, H2OEstimator], optional
        :param training_frame: Id of the training data frame. Defaults to ``None``.
        :type training_frame: Union[None, str, H2OFrame], optional
        :param validation_frame: Id of the validation data frame. Defaults to ``None``.
        :type validation_frame: Union[None, str, H2OFrame], optional
        :param seed: Seed for pseudo random number generator (if applicable). Defaults to ``-1``.
        :type seed: int
        :param keep_cross_validation_models: Whether to keep the cross-validation models. Defaults to ``True``.
        :type keep_cross_validation_models: bool
        :param keep_cross_validation_predictions: Whether to keep the predictions of the cross-validation models.
               Defaults to ``False``.
        :type keep_cross_validation_predictions: bool
        :param keep_cross_validation_fold_assignment: Whether to keep the cross-validation fold assignment.
               Defaults to ``False``.
        :type keep_cross_validation_fold_assignment: bool
        :param nfolds: Number of folds for K-fold cross-validation (0 to disable or >= 2). Defaults to ``0``.
        :type nfolds: int
        :param fold_assignment: Cross-validation fold assignment scheme, if fold_column is not specified. The
               'Stratified' option will stratify the folds based on the response variable, for classification
               problems. Defaults to ``"auto"``.
        :type fold_assignment: Literal["auto", "random", "modulo", "stratified"]
        :param fold_column: Column with cross-validation fold index assignment per observation. Defaults to ``None``.
        :type fold_column: str, optional
        :param response_column: Response variable column. Defaults to ``None``.
        :type response_column: str, optional
        :param ignored_columns: Names of columns to ignore for training. Defaults to ``None``.
        :type ignored_columns: List[str], optional
        :param ignore_const_cols: Ignore constant columns. Defaults to ``True``.
        :type ignore_const_cols: bool
        :param score_each_iteration: Whether to score during each iteration of model training. Defaults to ``False``.
        :type score_each_iteration: bool
        :param offset_column: Offset column. This will be added to the combination of columns before applying the
               link function. Defaults to ``None``.
        :type offset_column: str, optional
        :param weights_column: Column with observation weights. Giving some observation a weight of zero is
               equivalent to excluding it from the dataset; giving an observation a relative weight of 2 is
               equivalent to repeating that row twice. Negative weights are not allowed. Note: Weights are per-row
               observation weights and do not increase the size of the data frame. This is typically the number of
               times a row is repeated, but non-integer values are supported as well. During training, rows with
               higher weights matter more, due to the larger loss function pre-factor. If you set weight = 0 for a
               row, the returned prediction frame at that row is zero and this is incorrect. To get an accurate
               prediction, remove all rows with weight == 0. Defaults to ``None``.
        :type weights_column: str, optional
        :param standardize: Standardize numeric columns to have zero mean and unit variance. Defaults to ``False``.
        :type standardize: bool
        :param distribution: Distribution function. Defaults to ``"auto"``.
        :type distribution: Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie",
               "laplace", "quantile", "huber"]
        :param plug_values: Plug Values (a single row frame containing values that will be used to impute missing
               values of the training/validation frame, use in conjunction with missing_values_handling = PlugValues).
               Defaults to ``None``.
        :type plug_values: Union[None, str, H2OFrame], optional
        :param max_iterations: Maximum number of iterations. Defaults to ``0``.
        :type max_iterations: int
        :param stopping_rounds: Early stopping based on convergence of stopping_metric. Stop if simple moving average
               of length k of the stopping_metric does not improve for k:=stopping_rounds scoring events
               (0 to disable). Defaults to ``0``.
        :type stopping_rounds: int
        :param stopping_metric: Metric to use for early stopping (AUTO: logloss for classification, deviance for
               regression and anomaly_score for Isolation Forest). Note that custom and custom_increasing can only be
               used in GBM and DRF with the Python client. Defaults to ``"auto"``.
        :type stopping_metric: Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr",
               "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"]
        :param stopping_tolerance: Relative tolerance for metric-based stopping criterion (stop if relative
               improvement is not at least this much). Defaults to ``0.001``.
        :type stopping_tolerance: float
        :param balance_classes: Balance training data class counts via over/under-sampling (for imbalanced data).
               Defaults to ``False``.
        :type balance_classes: bool
        :param class_sampling_factors: Desired over/under-sampling ratios per class (in lexicographic order). If not
               specified, sampling factors will be automatically computed to obtain class balance during training.
               Requires balance_classes. Defaults to ``None``.
        :type class_sampling_factors: List[float], optional
        :param max_after_balance_size: Maximum relative size of the training data after balancing class counts (can
               be less than 1.0). Requires balance_classes. Defaults to ``5.0``.
        :type max_after_balance_size: float
        :param max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
               Defaults to ``0.0``.
        :type max_runtime_secs: float
        :param custom_metric_func: Reference to custom evaluation function, format: `language:keyName=funcName`.
               Defaults to ``None``.
        :type custom_metric_func: str, optional
        :param auc_type: Set default multinomial AUC type. Defaults to ``"auto"``.
        :type auc_type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]
        :param algorithm: Type of machine learning algorithm used to build the infogram. Options include 'AUTO'
               (gbm), 'deeplearning' (Deep Learning with default parameters), 'drf' (Random Forest with default
               parameters), 'gbm' (GBM with default parameters), 'glm' (GLM with default parameters), or 'xgboost'
               (if available, XGBoost with default parameters). Defaults to ``"auto"``.
        :type algorithm: Literal["auto", "deeplearning", "drf", "gbm", "glm", "xgboost"]
        :param algorithm_params: Customized parameters for the machine learning algorithm specified in the algorithm
               parameter. Defaults to ``None``.
        :type algorithm_params: dict, optional
        :param protected_columns: Columns that contain features that are sensitive and need to be protected (legally,
               or otherwise), if applicable. These features (e.g. race, gender, etc) should not drive the prediction
               of the response. Defaults to ``None``.
        :type protected_columns: List[str], optional
        :param total_information_threshold: A number between 0 and 1 representing a threshold for total information,
               defaulting to 0.1. For a specific feature, if the total information is higher than this threshold, and
               the corresponding net information is also higher than the threshold ``net_information_threshold``,
               that feature will be considered admissible. The total information is the x-axis of the Core Infogram.
               Default is -1 which gets set to 0.1. Defaults to ``-1.0``.
        :type total_information_threshold: float
        :param net_information_threshold: A number between 0 and 1 representing a threshold for net information,
               defaulting to 0.1. For a specific feature, if the net information is higher than this threshold, and
               the corresponding total information is also higher than the total_information_threshold, that feature
               will be considered admissible. The net information is the y-axis of the Core Infogram. Default is -1
               which gets set to 0.1. Defaults to ``-1.0``.
        :type net_information_threshold: float
        :param relevance_index_threshold: A number between 0 and 1 representing a threshold for the relevance index,
               defaulting to 0.1. This is only used when ``protected_columns`` is set by the user. For a specific
               feature, if the relevance index value is higher than this threshold, and the corresponding safety
               index is also higher than the ``safety_index_threshold``, that feature will be considered admissible.
               The relevance index is the x-axis of the Fair Infogram. Default is -1 which gets set to 0.1.
               Defaults to ``-1.0``.
        :type relevance_index_threshold: float
        :param safety_index_threshold: A number between 0 and 1 representing a threshold for the safety index,
               defaulting to 0.1. This is only used when protected_columns is set by the user. For a specific
               feature, if the safety index value is higher than this threshold, and the corresponding relevance
               index is also higher than the relevance_index_threshold, that feature will be considered admissible.
               The safety index is the y-axis of the Fair Infogram. Default is -1 which gets set to 0.1.
               Defaults to ``-1.0``.
        :type safety_index_threshold: float
        :param data_fraction: The fraction of training frame to use to build the infogram model. Defaults to 1.0, and
               any value greater than 0 and less than or equal to 1.0 is acceptable. Defaults to ``1.0``.
        :type data_fraction: float
        :param top_n_features: An integer specifying the number of columns to evaluate in the infogram. The columns
               are ranked by variable importance, and the top N are evaluated. Defaults to 50. Defaults to ``50``.
        :type top_n_features: int
        """
        super(H2OInfogram, self).__init__()
        self._parms = {}
        self._id = self._parms['model_id'] = model_id
        self.training_frame = training_frame
        self.validation_frame = validation_frame
        self.seed = seed
        self.keep_cross_validation_models = keep_cross_validation_models
        self.keep_cross_validation_predictions = keep_cross_validation_predictions
        self.keep_cross_validation_fold_assignment = keep_cross_validation_fold_assignment
        self.nfolds = nfolds
        self.fold_assignment = fold_assignment
        self.fold_column = fold_column
        self.response_column = response_column
        self.ignored_columns = ignored_columns
        self.ignore_const_cols = ignore_const_cols
        self.score_each_iteration = score_each_iteration
        self.offset_column = offset_column
        self.weights_column = weights_column
        self.standardize = standardize
        self.distribution = distribution
        self.plug_values = plug_values
        self.max_iterations = max_iterations
        self.stopping_rounds = stopping_rounds
        self.stopping_metric = stopping_metric
        self.stopping_tolerance = stopping_tolerance
        self.balance_classes = balance_classes
        self.class_sampling_factors = class_sampling_factors
        self.max_after_balance_size = max_after_balance_size
        self.max_runtime_secs = max_runtime_secs
        self.custom_metric_func = custom_metric_func
        self.auc_type = auc_type
        self.algorithm = algorithm
        self.algorithm_params = algorithm_params
        self.protected_columns = protected_columns
        self.total_information_threshold = total_information_threshold
        self.net_information_threshold = net_information_threshold
        self.relevance_index_threshold = relevance_index_threshold
        self.safety_index_threshold = safety_index_threshold
        self.data_fraction = data_fraction
        self.top_n_features = top_n_features
        self._parms["_rest_version"] = 3

    @property
    def training_frame(self):
        """
        Id of the training data frame.

        Type: ``Union[None, str, H2OFrame]``.
        """
        return self._parms.get("training_frame")

    @training_frame.setter
    def training_frame(self, training_frame):
        self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame')

    @property
    def validation_frame(self):
        """
        Id of the validation data frame.

        Type: ``Union[None, str, H2OFrame]``.
        """
        return self._parms.get("validation_frame")

    @validation_frame.setter
    def validation_frame(self, validation_frame):
        self._parms["validation_frame"] = H2OFrame._validate(validation_frame, 'validation_frame')

    @property
    def seed(self):
        """
        Seed for pseudo random number generator (if applicable).

        Type: ``int``, defaults to ``-1``.
        """
        return self._parms.get("seed")

    @seed.setter
    def seed(self, seed):
        assert_is_type(seed, None, int)
        self._parms["seed"] = seed

    @property
    def keep_cross_validation_models(self):
        """
        Whether to keep the cross-validation models.

        Type: ``bool``, defaults to ``True``.
        """
        return self._parms.get("keep_cross_validation_models")

    @keep_cross_validation_models.setter
    def keep_cross_validation_models(self, keep_cross_validation_models):
        assert_is_type(keep_cross_validation_models, None, bool)
        self._parms["keep_cross_validation_models"] = keep_cross_validation_models

    @property
    def keep_cross_validation_predictions(self):
        """
        Whether to keep the predictions of the cross-validation models.

        Type: ``bool``, defaults to ``False``.
""" return self._parms.get("keep_cross_validation_predictions") @keep_cross_validation_predictions.setter def keep_cross_validation_predictions(self, keep_cross_validation_predictions): assert_is_type(keep_cross_validation_predictions, None, bool) self._parms["keep_cross_validation_predictions"] = keep_cross_validation_predictions @property def keep_cross_validation_fold_assignment(self): """ Whether to keep the cross-validation fold assignment. Type: ``bool``, defaults to ``False``. """ return self._parms.get("keep_cross_validation_fold_assignment") @keep_cross_validation_fold_assignment.setter def keep_cross_validation_fold_assignment(self, keep_cross_validation_fold_assignment): assert_is_type(keep_cross_validation_fold_assignment, None, bool) self._parms["keep_cross_validation_fold_assignment"] = keep_cross_validation_fold_assignment @property def nfolds(self): """ Number of folds for K-fold cross-validation (0 to disable or >= 2). Type: ``int``, defaults to ``0``. """ return self._parms.get("nfolds") @nfolds.setter def nfolds(self, nfolds): assert_is_type(nfolds, None, int) self._parms["nfolds"] = nfolds @property def fold_assignment(self): """ Cross-validation fold assignment scheme, if fold_column is not specified. The 'Stratified' option will stratify the folds based on the response variable, for classification problems. Type: ``Literal["auto", "random", "modulo", "stratified"]``, defaults to ``"auto"``. """ return self._parms.get("fold_assignment") @fold_assignment.setter def fold_assignment(self, fold_assignment): assert_is_type(fold_assignment, None, Enum("auto", "random", "modulo", "stratified")) self._parms["fold_assignment"] = fold_assignment @property def fold_column(self): """ Column with cross-validation fold index assignment per observation. Type: ``str``. """ return self._parms.get("fold_column") @fold_column.setter def fold_column(self, fold_column): assert_is_type(fold_column, None, str) self._parms["fold_column"] = fold_column @property def response_column(self): """ Response variable column. Type: ``str``. """ return self._parms.get("response_column") @response_column.setter def response_column(self, response_column): assert_is_type(response_column, None, str) self._parms["response_column"] = response_column @property def ignored_columns(self): """ Names of columns to ignore for training. Type: ``List[str]``. """ return self._parms.get("ignored_columns") @ignored_columns.setter def ignored_columns(self, ignored_columns): assert_is_type(ignored_columns, None, [str]) self._parms["ignored_columns"] = ignored_columns @property def ignore_const_cols(self): """ Ignore constant columns. Type: ``bool``, defaults to ``True``. """ return self._parms.get("ignore_const_cols") @ignore_const_cols.setter def ignore_const_cols(self, ignore_const_cols): assert_is_type(ignore_const_cols, None, bool) self._parms["ignore_const_cols"] = ignore_const_cols @property def score_each_iteration(self): """ Whether to score during each iteration of model training. Type: ``bool``, defaults to ``False``. """ return self._parms.get("score_each_iteration") @score_each_iteration.setter def score_each_iteration(self, score_each_iteration): assert_is_type(score_each_iteration, None, bool) self._parms["score_each_iteration"] = score_each_iteration @property def offset_column(self): """ Offset column. This will be added to the combination of columns before applying the link function. Type: ``str``. 
""" return self._parms.get("offset_column") @offset_column.setter def offset_column(self, offset_column): assert_is_type(offset_column, None, str) self._parms["offset_column"] = offset_column @property def weights_column(self): """ Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data frame. This is typically the number of times a row is repeated, but non-integer values are supported as well. During training, rows with higher weights matter more, due to the larger loss function pre-factor. If you set weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get an accurate prediction, remove all rows with weight == 0. Type: ``str``. """ return self._parms.get("weights_column") @weights_column.setter def weights_column(self, weights_column): assert_is_type(weights_column, None, str) self._parms["weights_column"] = weights_column @property def standardize(self): """ Standardize numeric columns to have zero mean and unit variance. Type: ``bool``, defaults to ``False``. """ return self._parms.get("standardize") @standardize.setter def standardize(self, standardize): assert_is_type(standardize, None, bool) self._parms["standardize"] = standardize @property def distribution(self): """ Distribution function Type: ``Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber"]``, defaults to ``"auto"``. """ return self._parms.get("distribution") @distribution.setter def distribution(self, distribution): assert_is_type(distribution, None, Enum("auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber")) self._parms["distribution"] = distribution @property def plug_values(self): """ Plug Values (a single row frame containing values that will be used to impute missing values of the training/validation frame, use with conjunction missing_values_handling = PlugValues). Type: ``Union[None, str, H2OFrame]``. """ return self._parms.get("plug_values") @plug_values.setter def plug_values(self, plug_values): self._parms["plug_values"] = H2OFrame._validate(plug_values, 'plug_values') @property def max_iterations(self): """ Maximum number of iterations. Type: ``int``, defaults to ``0``. """ return self._parms.get("max_iterations") @max_iterations.setter def max_iterations(self, max_iterations): assert_is_type(max_iterations, None, int) self._parms["max_iterations"] = max_iterations @property def stopping_rounds(self): """ Early stopping based on convergence of stopping_metric. Stop if simple moving average of length k of the stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable) Type: ``int``, defaults to ``0``. """ return self._parms.get("stopping_rounds") @stopping_rounds.setter def stopping_rounds(self, stopping_rounds): assert_is_type(stopping_rounds, None, int) self._parms["stopping_rounds"] = stopping_rounds @property def stopping_metric(self): """ Metric to use for early stopping (AUTO: logloss for classification, deviance for regression and anomaly_score for Isolation Forest). Note that custom and custom_increasing can only be used in GBM and DRF with the Python client. 
Type: ``Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"]``, defaults to ``"auto"``. """ return self._parms.get("stopping_metric") @stopping_metric.setter def stopping_metric(self, stopping_metric): assert_is_type(stopping_metric, None, Enum("auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing")) self._parms["stopping_metric"] = stopping_metric @property def stopping_tolerance(self): """ Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this much) Type: ``float``, defaults to ``0.001``. """ return self._parms.get("stopping_tolerance") @stopping_tolerance.setter def stopping_tolerance(self, stopping_tolerance): assert_is_type(stopping_tolerance, None, numeric) self._parms["stopping_tolerance"] = stopping_tolerance @property def balance_classes(self): """ Balance training data class counts via over/under-sampling (for imbalanced data). Type: ``bool``, defaults to ``False``. """ return self._parms.get("balance_classes") @balance_classes.setter def balance_classes(self, balance_classes): assert_is_type(balance_classes, None, bool) self._parms["balance_classes"] = balance_classes @property def class_sampling_factors(self): """ Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling factors will be automatically computed to obtain class balance during training. Requires balance_classes. Type: ``List[float]``. """ return self._parms.get("class_sampling_factors") @class_sampling_factors.setter def class_sampling_factors(self, class_sampling_factors): assert_is_type(class_sampling_factors, None, [float]) self._parms["class_sampling_factors"] = class_sampling_factors @property def max_after_balance_size(self): """ Maximum relative size of the training data after balancing class counts (can be less than 1.0). Requires balance_classes. Type: ``float``, defaults to ``5.0``. """ return self._parms.get("max_after_balance_size") @max_after_balance_size.setter def max_after_balance_size(self, max_after_balance_size): assert_is_type(max_after_balance_size, None, float) self._parms["max_after_balance_size"] = max_after_balance_size @property def max_runtime_secs(self): """ Maximum allowed runtime in seconds for model training. Use 0 to disable. Type: ``float``, defaults to ``0.0``. """ return self._parms.get("max_runtime_secs") @max_runtime_secs.setter def max_runtime_secs(self, max_runtime_secs): assert_is_type(max_runtime_secs, None, numeric) self._parms["max_runtime_secs"] = max_runtime_secs @property def custom_metric_func(self): """ Reference to custom evaluation function, format: `language:keyName=funcName` Type: ``str``. """ return self._parms.get("custom_metric_func") @custom_metric_func.setter def custom_metric_func(self, custom_metric_func): assert_is_type(custom_metric_func, None, str) self._parms["custom_metric_func"] = custom_metric_func @property def auc_type(self): """ Set default multinomial AUC type. Type: ``Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]``, defaults to ``"auto"``. 
""" return self._parms.get("auc_type") @auc_type.setter def auc_type(self, auc_type): assert_is_type(auc_type, None, Enum("auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo")) self._parms["auc_type"] = auc_type @property def algorithm(self): """ Type of machine learning algorithm used to build the infogram. Options include 'AUTO' (gbm), 'deeplearning' (Deep Learning with default parameters), 'drf' (Random Forest with default parameters), 'gbm' (GBM with default parameters), 'glm' (GLM with default parameters), or 'xgboost' (if available, XGBoost with default parameters). Type: ``Literal["auto", "deeplearning", "drf", "gbm", "glm", "xgboost"]``, defaults to ``"auto"``. """ return self._parms.get("algorithm") @algorithm.setter def algorithm(self, algorithm): assert_is_type(algorithm, None, Enum("auto", "deeplearning", "drf", "gbm", "glm", "xgboost")) self._parms["algorithm"] = algorithm @property def algorithm_params(self): """ Customized parameters for the machine learning algorithm specified in the algorithm parameter. Type: ``dict``. """ if self._parms.get("algorithm_params") != None: algorithm_params_dict = ast.literal_eval(self._parms.get("algorithm_params")) for k in algorithm_params_dict: if len(algorithm_params_dict[k]) == 1: #single parameter algorithm_params_dict[k] = algorithm_params_dict[k][0] return algorithm_params_dict else: return self._parms.get("algorithm_params") @algorithm_params.setter def algorithm_params(self, algorithm_params): assert_is_type(algorithm_params, None, dict) if algorithm_params is not None and algorithm_params != "": for k in algorithm_params: if ("[" and "]") not in str(algorithm_params[k]): algorithm_params[k] = [algorithm_params[k]] self._parms["algorithm_params"] = str(json.dumps(algorithm_params)) else: self._parms["algorithm_params"] = None @property def protected_columns(self): """ Columns that contain features that are sensitive and need to be protected (legally, or otherwise), if applicable. These features (e.g. race, gender, etc) should not drive the prediction of the response. Type: ``List[str]``. """ return self._parms.get("protected_columns") @protected_columns.setter def protected_columns(self, protected_columns): assert_is_type(protected_columns, None, [str]) self._parms["protected_columns"] = protected_columns @property def total_information_threshold(self): """ A number between 0 and 1 representing a threshold for total information, defaulting to 0.1. For a specific feature, if the total information is higher than this threshold, and the corresponding net information is also higher than the threshold ``net_information_threshold``, that feature will be considered admissible. The total information is the x-axis of the Core Infogram. Default is -1 which gets set to 0.1. Type: ``float``, defaults to ``-1.0``. """ return self._parms.get("total_information_threshold") @total_information_threshold.setter def total_information_threshold(self, total_information_threshold): if total_information_threshold <= -1: # not set if self._parms["protected_columns"] is None: self._parms["total_information_threshold"] = 0.1 else: if self._parms["protected_columns"] is not None: # fair infogram warnings.warn("Should not set total_information_threshold for fair infogram runs. Set relevance_index_threshold instead. 
Using default of 0.1 if not set", RuntimeWarning) else: self._parms["total_information_threshold"] = total_information_threshold @property def net_information_threshold(self): """ A number between 0 and 1 representing a threshold for net information, defaulting to 0.1. For a specific feature, if the net information is higher than this threshold, and the corresponding total information is also higher than the total_information_threshold, that feature will be considered admissible. The net information is the y-axis of the Core Infogram. Default is -1 which gets set to 0.1. Type: ``float``, defaults to ``-1.0``. """ return self._parms.get("net_information_threshold") @net_information_threshold.setter def net_information_threshold(self, net_information_threshold): if net_information_threshold <= -1: # not set if self._parms["protected_columns"] is None: self._parms["net_information_threshold"]=0.1 else: # set if self._parms["protected_columns"] is not None: # fair infogram warnings.warn("Should not set net_information_threshold for fair infogram runs. Set safety_index_threshold instead. Using default of 0.1 if not set", RuntimeWarning) else: self._parms["net_information_threshold"]=net_information_threshold @property def relevance_index_threshold(self): """ A number between 0 and 1 representing a threshold for the relevance index, defaulting to 0.1. This is only used when ``protected_columns`` is set by the user. For a specific feature, if the relevance index value is higher than this threshold, and the corresponding safety index is also higher than the safety_index_threshold``, that feature will be considered admissible. The relevance index is the x-axis of the Fair Infogram. Default is -1 which gets set to 0.1. Type: ``float``, defaults to ``-1.0``. """ return self._parms.get("relevance_index_threshold") @relevance_index_threshold.setter def relevance_index_threshold(self, relevance_index_threshold): if relevance_index_threshold <= -1: # not set if self._parms["protected_columns"] is not None: # fair infogram self._parms["relevance_index_threshold"]=0.1 else: # it is set if self._parms["protected_columns"] is not None: # fair infogram self._parms["relevance_index_threshold"] = relevance_index_threshold else: # core infogram should not have been set warnings.warn("Should not set relevance_index_threshold for core infogram runs. Set total_information_threshold instead. Using default of 0.1 if not set", RuntimeWarning) @property def safety_index_threshold(self): """ A number between 0 and 1 representing a threshold for the safety index, defaulting to 0.1. This is only used when protected_columns is set by the user. For a specific feature, if the safety index value is higher than this threshold, and the corresponding relevance index is also higher than the relevance_index_threshold, that feature will be considered admissible. The safety index is the y-axis of the Fair Infogram. Default is -1 which gets set to 0.1. Type: ``float``, defaults to ``-1.0``. """ return self._parms.get("safety_index_threshold") @safety_index_threshold.setter def safety_index_threshold(self, safety_index_threshold): if safety_index_threshold <= -1: # not set if self._parms["protected_columns"] is not None: self._parms["safety_index_threshold"]=0.1 else: # it is set if self._parms["protected_columns"] is not None: # fair infogram self._parms["safety_index_threshold"] = safety_index_threshold else: # core infogram should not have been set warnings.warn("Should not set safety_index_threshold for core infogram runs. 
Set net_information_threshold instead. Using default of 0.1 if not set", RuntimeWarning) @property def data_fraction(self): """ The fraction of training frame to use to build the infogram model. Defaults to 1.0, and any value greater than 0 and less than or equal to 1.0 is acceptable. Type: ``float``, defaults to ``1.0``. """ return self._parms.get("data_fraction") @data_fraction.setter def data_fraction(self, data_fraction): assert_is_type(data_fraction, None, numeric) self._parms["data_fraction"] = data_fraction @property def top_n_features(self): """ An integer specifying the number of columns to evaluate in the infogram. The columns are ranked by variable importance, and the top N are evaluated. Defaults to 50. Type: ``int``, defaults to ``50``. """ return self._parms.get("top_n_features") @top_n_features.setter def top_n_features(self, top_n_features): assert_is_type(top_n_features, None, int) self._parms["top_n_features"] = top_n_features def _extract_x_from_model(self): """ extract admissible features from an Infogram model. :return: List of predictors that are considered admissible """ features = self._model_json.get('output', {}).get('admissible_features') if features is None: raise ValueError("model %s doesn't have any admissible features" % self.key) return set(features)
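    # The threshold setters above behave differently for core and fair infograms. An illustrative construction
    # sketch (the column names "race" and "gender" are placeholders, not part of this module):
    #
    #     >>> core_ig = H2OInfogram(total_information_threshold=0.2,  # core infogram: x-axis threshold
    #     ...                       net_information_threshold=0.2)    # core infogram: y-axis threshold
    #     >>> fair_ig = H2OInfogram(protected_columns=["race", "gender"],
    #     ...                       relevance_index_threshold=0.2,    # fair infogram: x-axis threshold
    #     ...                       safety_index_threshold=0.2)       # fair infogram: y-axis threshold
    #
    # Setting total/net information thresholds on a fair infogram (or relevance/safety index thresholds on a core
    # infogram) only raises the RuntimeWarning above; the value is ignored and the 0.1 default applies.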
    def plot(self, train=True, valid=False, xval=False, figsize=(10, 10), title="Infogram", legend_on=False,
             server=False):
        """
        Plot the infogram. By default, it will plot the infogram calculated from the training dataset. Note that the
        frame rel_cmi_frame contains the following columns:

        - 0: predictor names
        - 1: admissible
        - 2: admissible index
        - 3: relevance-index or total information
        - 4: safety-index or net information, normalized from 0 to 1
        - 5: safety-index or net information not normalized

        :param train: True if infogram is generated from the training dataset
        :param valid: True if infogram is generated from the validation dataset
        :param xval: True if infogram is generated from the cross-validation holdout dataset
        :param figsize: size of the infogram plot
        :param title: string to denote the title of the plot
        :param legend_on: legend text is included if True
        :param server: if True, the plot is built but not displayed (useful for server environments)
        :return: the infogram plot wrapped in a decorated plot result
        """
        plt = get_matplotlib_pyplot(server, raise_if_not_available=True)
        polycoll = get_polycollection(server, raise_if_not_available=True)
        if not can_use_numpy():
            raise ImportError("numpy is required for Infogram.")
        import numpy as np

        if train:
            rel_cmi_frame = self.get_admissible_score_frame()
            if rel_cmi_frame is None:
                raise H2OValueError("Cannot locate the H2OFrame containing the infogram data from training dataset.")
        if valid:
            rel_cmi_frame_valid = self.get_admissible_score_frame(valid=True)
            if rel_cmi_frame_valid is None:
                raise H2OValueError("Cannot locate the H2OFrame containing the infogram data from validation dataset.")
        if xval:
            rel_cmi_frame_xval = self.get_admissible_score_frame(xval=True)
            if rel_cmi_frame_xval is None:
                raise H2OValueError("Cannot locate the H2OFrame containing the infogram data from xval holdout dataset.")

        rel_cmi_frame_names = rel_cmi_frame.names
        x_label = rel_cmi_frame_names[3]
        y_label = rel_cmi_frame_names[4]
        ig_x_column = 3
        ig_y_column = 4
        index_of_admissible = 1
        features_column = 0
        if self.actual_params['protected_columns'] is None:
            x_thresh = self.actual_params['total_information_threshold']
            y_thresh = self.actual_params['net_information_threshold']
        else:
            x_thresh = self.actual_params["relevance_index_threshold"]
            y_thresh = self.actual_params["safety_index_threshold"]
        xmax = 1.1
        ymax = 1.1

        X = np.array(rel_cmi_frame[ig_x_column].as_data_frame(header=False, use_pandas=False)).astype(float).reshape((-1,))
        Y = np.array(rel_cmi_frame[ig_y_column].as_data_frame(header=False, use_pandas=False)).astype(float).reshape((-1,))
        features = np.array(rel_cmi_frame[features_column].as_data_frame(header=False, use_pandas=False)).reshape((-1,))
        admissible = np.array(rel_cmi_frame[index_of_admissible].as_data_frame(header=False, use_pandas=False)).astype(float).reshape((-1,))
        mask = admissible > 0
        if valid:
            X_valid = np.array(rel_cmi_frame_valid[ig_x_column].as_data_frame(header=False, use_pandas=False)).astype(float).reshape((-1,))
            Y_valid = np.array(rel_cmi_frame_valid[ig_y_column].as_data_frame(header=False, use_pandas=False)).astype(float).reshape((-1,))
            features_valid = np.array(rel_cmi_frame_valid[features_column].as_data_frame(header=False, use_pandas=False)).reshape((-1,))
            admissible_valid = np.array(rel_cmi_frame_valid[index_of_admissible].as_data_frame(header=False, use_pandas=False)).astype(float).reshape((-1,))
            mask_valid = admissible_valid > 0
        if xval:
            X_xval = np.array(rel_cmi_frame_xval[ig_x_column].as_data_frame(header=False, use_pandas=False)).astype(float).reshape((-1,))
            Y_xval = np.array(rel_cmi_frame_xval[ig_y_column].as_data_frame(header=False, use_pandas=False)).astype(float).reshape((-1,))
            features_xval = np.array(rel_cmi_frame_xval[features_column].as_data_frame(header=False, use_pandas=False)).reshape((-1,))
            admissible_xval = np.array(rel_cmi_frame_xval[index_of_admissible].as_data_frame(header=False, use_pandas=False)).astype(float).reshape((-1,))
            mask_xval = admissible_xval > 0

        plt.figure(figsize=figsize)
        plt.grid(True)
        plt.scatter(X, Y, zorder=10, c=np.where(mask, "black", "gray"), label="training data")
        if valid:
            plt.scatter(X_valid, Y_valid, zorder=10, marker=",", c=np.where(mask_valid, "black", "gray"),
                        label="validation data")
        if xval:
            plt.scatter(X_xval, Y_xval, zorder=10, marker="v", c=np.where(mask_xval, "black", "gray"),
                        label="xval holdout data")
        if legend_on:
            plt.legend(loc=2, fancybox=True, framealpha=0.5)
        plt.hlines(y_thresh, xmin=x_thresh, xmax=xmax, colors="red", linestyle="dashed")
        plt.vlines(x_thresh, ymin=y_thresh, ymax=ymax, colors="red", linestyle="dashed")
        plt.gca().add_collection(polycoll(verts=[[(0, 0), (0, ymax), (x_thresh, ymax), (x_thresh, y_thresh),
                                                  (xmax, y_thresh), (xmax, 0)]],
                                          color="#CC663E", alpha=0.1, zorder=5))
        for i in mask.nonzero()[0]:
            plt.annotate(features[i], (X[i], Y[i]), xytext=(0, -10), textcoords="offset points",
                         horizontalalignment='center', verticalalignment='top', color="blue")
        if valid:
            for i in mask_valid.nonzero()[0]:
                plt.annotate(features_valid[i], (X_valid[i], Y_valid[i]), xytext=(0, -10), textcoords="offset points",
                             horizontalalignment='center', verticalalignment='top', color="magenta")
        if xval:
            for i in mask_xval.nonzero()[0]:
                plt.annotate(features_xval[i], (X_xval[i], Y_xval[i]), xytext=(0, -10), textcoords="offset points",
                             horizontalalignment='center', verticalalignment='top', color="green")
        plt.xlim(0, 1.05)
        plt.ylim(0, 1.05)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.title(title)
        fig = plt.gcf()
        if not server:
            plt.show()
        return decorate_plot_result(figure=fig)
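    # A minimal plotting sketch (assumes a connected H2O cluster; `df` is an H2OFrame with a categorical response
    # "y", and "race"/"gender" are placeholder protected column names):
    #
    #     >>> ig = H2OInfogram(protected_columns=["race", "gender"])
    #     >>> ig.train(y="y", training_frame=df)
    #     >>> ig.plot(title="Fair infogram", legend_on=True)   # displays the plot
    #     >>> result = ig.plot(server=True)                    # builds and returns the figure without displaying it
    #
    # With protected_columns the axes are relevance index vs. safety index; without them the same call draws the
    # core infogram (total vs. net information).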
    def get_admissible_score_frame(self, valid=False, xval=False):
        """
        Retrieve the admissible score frame, which includes the relevance and CMI information, as an H2OFrame; by
        default the frame for the training dataset is returned.

        :param valid: return infogram info on the validation dataset if True
        :param xval: return infogram info on the cross-validation holdouts if True
        :return: H2OFrame
        """
        keyString = self._model_json["output"]["admissible_score_key"]
        if valid:
            keyString = self._model_json["output"]["admissible_score_key_valid"]
        elif xval:
            keyString = self._model_json["output"]["admissible_score_key_xval"]
        if keyString is None:
            return None
        else:
            return h2o.get_frame(keyString['name'])
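    # Illustrative use of the admissible score frame (assumes `ig` is a trained H2OInfogram; the column positions
    # follow the layout documented in plot() above):
    #
    #     >>> score = ig.get_admissible_score_frame()                   # scores on the training data
    #     >>> score_valid = ig.get_admissible_score_frame(valid=True)   # None unless a validation frame was supplied
    #     >>> admissible_only = score[score[1] == 1, :]                 # column 1 is the admissible flag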
    def get_admissible_features(self):
        """
        :return: a list of predictors that are considered admissible
        """
        if self._model_json["output"]["admissible_features"] is None:
            return None
        else:
            return self._model_json["output"]["admissible_features"]
    def get_admissible_relevance(self):
        """
        :return: a list of relevance (variable importance) values for admissible attributes
        """
        if self._model_json["output"]["admissible_relevance"] is None:
            return None
        else:
            return self._model_json["output"]["admissible_relevance"]
    def get_admissible_cmi(self):
        """
        :return: a list of the normalized CMI of admissible attributes
        """
        if self._model_json["output"]["admissible_cmi"] is None:
            return None
        else:
            return self._model_json["output"]["admissible_cmi"]
    def get_admissible_cmi_raw(self):
        """
        :return: a list of the raw CMI of admissible attributes
        """
        if self._model_json["output"]["admissible_cmi_raw"] is None:
            return None
        else:
            return self._model_json["output"]["admissible_cmi_raw"]
    def get_all_predictor_relevance(self):
        """
        Get the relevance of all predictors.

        :return: a tuple of two lists: the first contains the predictor names and the second their relevance
        """
        if self._model_json["output"]["all_predictor_names"] is None:
            return None
        else:
            return self._model_json["output"]["all_predictor_names"], self._model_json["output"]["relevance"]
    def get_all_predictor_cmi(self):
        """
        Get the normalized CMI of all predictors.

        :return: a tuple of two lists: the first contains the predictor names and the second their normalized CMI
        """
        if self._model_json["output"]["all_predictor_names"] is None:
            return None
        else:
            return self._model_json["output"]["all_predictor_names"], self._model_json["output"]["cmi"]
    def get_all_predictor_cmi_raw(self):
        """
        Get the raw CMI of all predictors.

        :return: a tuple of two lists: the first contains the predictor names and the second their raw CMI
        """
        if self._model_json["output"]["all_predictor_names"] is None:
            return None
        else:
            return self._model_json["output"]["all_predictor_names"], self._model_json["output"]["cmi_raw"]
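    # Illustrative accessor usage (assumes `ig` is a trained H2OInfogram):
    #
    #     >>> admissible = ig.get_admissible_features()      # names of admissible predictors, or None
    #     >>> names, rel = ig.get_all_predictor_relevance()  # relevance for every evaluated predictor
    #     >>> names, cmi = ig.get_all_predictor_cmi()        # normalized CMI for every evaluated predictor
    #     >>> ranking = dict(zip(names, rel))                # map predictor name -> relevance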
    # Override train method to support infogram needs
    def train(self, x=None, y=None, training_frame=None, verbose=False, **kwargs):
        sup = super(self.__class__, self)

        def extend_parms(parms):  # add parameter checks specific to infogram
            if parms["data_fraction"] is not None:
                assert_is_type(parms["data_fraction"], numeric)
                assert 0 < parms["data_fraction"] <= 1, "data_fraction should exceed 0 and be less than or equal to 1."

        parms = sup._make_parms(x, y, training_frame, extend_parms_fn=extend_parms, **kwargs)
        sup._train(parms, verbose=verbose)
        # can probably get rid of model attributes that Erin does not want here
        return self
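    # A minimal training sketch (assumes a running H2O cluster; `df` is an H2OFrame with a categorical response
    # "y"; the algorithm_params values are illustrative, not recommended settings):
    #
    #     >>> ig = H2OInfogram(algorithm="gbm",
    #     ...                  algorithm_params={"ntrees": 3},  # forwarded to the underlying GBM
    #     ...                  data_fraction=0.8)               # build the infogram on 80% of the rows
    #     >>> ig.train(y="y", training_frame=df)
    #     >>> ig.get_admissible_features()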
    @staticmethod
    def _train_and_get_models(model_class, x, y, train, **kwargs):
        from h2o.automl import H2OAutoML
        from h2o.grid import H2OGridSearch
        model = model_class(**kwargs)
        model.train(x, y, train)
        if model_class is H2OAutoML:
            return [h2o.get_model(m[0]) for m in model.leaderboard["model_id"].as_data_frame(False, False)]
        elif model_class is H2OGridSearch:
            return [h2o.get_model(m) for m in model.model_ids]
        else:
            return [model]
    def train_subset_models(self, model_class, y, training_frame, test_frame, protected_columns=None, reference=None,
                            favorable_class=None, feature_selection_metrics=None, metric="euclidean", **kwargs):
        """
        Train models using different feature subsets selected by the infogram.

        :param model_class: H2O Estimator class, H2OAutoML, or H2OGridSearch
        :param y: response column
        :param training_frame: training frame
        :param test_frame: test frame
        :param protected_columns: List of categorical columns that contain sensitive information such as race,
            gender, age etc.
        :param reference: List of values corresponding to a reference for each protected column. If set to ``None``,
            it will use the biggest group as the reference.
        :param favorable_class: Positive/favorable outcome class of the response.
        :param feature_selection_metrics: column names from the infogram's admissible score frame that are used for
            the feature subset selection. Defaults to ``safety_index`` for the fair infogram and ``admissible_index``
            for the core infogram.
        :param metric: metric to combine information from the columns specified in feature_selection_metrics. Can be
            one of "euclidean", "manhattan", "maximum", or a function that takes the admissible score frame and
            feature_selection_metrics and produces a single column.
        :param kwargs: Arguments passed to the constructor of the model_class
        :return: H2OFrame

        :examples:
        >>> from h2o.estimators import H2OGradientBoostingEstimator, H2OInfogram
        >>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/admissibleml_test/taiwan_credit_card_uci.csv")
        >>> x = ['LIMIT_BAL', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3',
        >>>      'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
        >>> y = "default payment next month"
        >>> protected_columns = ['SEX', 'EDUCATION']
        >>>
        >>> for c in [y] + protected_columns:
        >>>     data[c] = data[c].asfactor()
        >>>
        >>> train, test = data.split_frame([0.8])
        >>>
        >>> reference = ["1", "2"]  # university educated single man
        >>> favorable_class = "0"  # no default next month
        >>>
        >>> ig = H2OInfogram(protected_columns=protected_columns)
        >>> ig.train(x, y, training_frame=train)
        >>>
        >>> ig.train_subset_models(H2OGradientBoostingEstimator, y, train, test, protected_columns, reference, favorable_class)
        """
        from h2o import H2OFrame, make_leaderboard
        from h2o.explanation import disparate_analysis
        from h2o.utils.typechecks import assert_is_type
        assert hasattr(model_class, "train")
        assert_is_type(y, str)
        assert_is_type(training_frame, H2OFrame)

        score = self.get_admissible_score_frame()
        if feature_selection_metrics is None:
            if "safety_index" in score.columns:
                feature_selection_metrics = ["safety_index"]
            else:
                feature_selection_metrics = ["admissible_index"]

        for fs_col in feature_selection_metrics:
            if fs_col not in score.columns:
                raise ValueError("Column '{}' is not present in the admissible score frame.".format(fs_col))

        metrics = dict(
            euclidean=lambda fr, fs_metrics: (fr[:, fs_metrics] ** 2).sum(axis=1).sqrt(),
            manhattan=lambda fr, fs_metrics: fr[:, fs_metrics].abs().sum(axis=1),
            maximum=lambda fr, fs_metrics: fr[:, fs_metrics].apply(lambda row: row.max(), axis=1),
        )
        metric_fn = metric
        if not callable(metric) and metric.lower() not in metrics.keys():
            raise ValueError("Metric '{}' is not supported!".format(metric.lower()))
        if not callable(metric):
            metric_fn = metrics.get(metric.lower())

        if len(feature_selection_metrics) == 1:
            # sum(..., axis=1) behaves oddly for a single column (it collapses to a single number), so use the
            # column directly
            score["sort_metric"] = score[:, feature_selection_metrics]
        else:
            score["sort_metric"] = metric_fn(score, feature_selection_metrics)
        score = score.sort("sort_metric", False)
        cols = [x[0] for x in score["column"].as_data_frame(False, False)]
        subsets = [cols[0:i] for i in range(1, len(cols) + 1)]

        models = []
        for x in subsets:
            models.extend(self._train_and_get_models(model_class, x, y, training_frame, **kwargs))

        if protected_columns is None or len(protected_columns) == 0:
            return make_leaderboard(models, leaderboard_frame=test_frame)
        return disparate_analysis(models, test_frame, protected_columns, reference, favorable_class)
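    # The `metric` argument of train_subset_models() also accepts a callable that reduces the selected
    # admissible-score columns to a single ranking column. A hedged sketch (assumes `ig`, `train`, `test`, `y`,
    # `protected_columns`, `reference` and `favorable_class` are defined as in the :examples: block above; the
    # column name "relevance_index" and the 0.7/0.3 weighting are assumptions for illustration only -- check
    # ig.get_admissible_score_frame().columns for the actual column names):
    #
    #     >>> def weighted_sum(score_frame, fs_metrics):
    #     ...     # combine the selected columns into a single sort column
    #     ...     return 0.7 * score_frame[:, fs_metrics[0]] + 0.3 * score_frame[:, fs_metrics[1]]
    #     >>> ig.train_subset_models(H2OGradientBoostingEstimator, y, train, test,
    #     ...                        protected_columns, reference, favorable_class,
    #     ...                        feature_selection_metrics=["safety_index", "relevance_index"],
    #     ...                        metric=weighted_sum)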