Source code for h2o.estimators.infogram

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py
# Copyright 2016 H2O.ai;  Apache License Version 2.0 (see LICENSE for details)
#

import ast
import json
import warnings
import h2o
from h2o.utils.shared_utils import can_use_numpy
from h2o.utils.typechecks import is_type
from h2o.plot import get_matplotlib_pyplot, decorate_plot_result, get_polycollection
from h2o.estimators.estimator_base import H2OEstimator
from h2o.exceptions import H2OValueError
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type, Enum, numeric


class H2OInfogram(H2OEstimator):
    """
    Information Diagram

    The infogram is a graphical information-theoretic interpretability tool which allows the user to quickly spot
    the core, decision-making variables that uniquely and safely drive the response, in supervised classification
    problems. The infogram can significantly cut down the number of predictors needed to build a model by
    identifying only the most valuable, admissible features. When protected variables such as race or gender are
    present in the data, the admissibility of a variable is determined by a safety and relevancy index, and thus
    serves as a diagnostic tool for fairness. The safety of each feature can be quantified, and variables that are
    unsafe will be considered inadmissible. Models built using only admissible features will naturally be more
    interpretable, given the reduced feature set. Admissible models are also less susceptible to overfitting and
    train faster, while providing similar accuracy as models built using all available features.
    """

    algo = "infogram"
    supervised_learning = True

    def __init__(self,
                 model_id=None,  # type: Optional[Union[None, str, H2OEstimator]]
                 training_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
                 validation_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
                 seed=-1,  # type: int
                 keep_cross_validation_models=True,  # type: bool
                 keep_cross_validation_predictions=False,  # type: bool
                 keep_cross_validation_fold_assignment=False,  # type: bool
                 nfolds=0,  # type: int
                 fold_assignment="auto",  # type: Literal["auto", "random", "modulo", "stratified"]
                 fold_column=None,  # type: Optional[str]
                 response_column=None,  # type: Optional[str]
                 ignored_columns=None,  # type: Optional[List[str]]
                 ignore_const_cols=True,  # type: bool
                 score_each_iteration=False,  # type: bool
                 offset_column=None,  # type: Optional[str]
                 weights_column=None,  # type: Optional[str]
                 standardize=False,  # type: bool
                 distribution="auto",  # type: Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber"]
                 plug_values=None,  # type: Optional[Union[None, str, H2OFrame]]
                 max_iterations=0,  # type: int
                 stopping_rounds=0,  # type: int
                 stopping_metric="auto",  # type: Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"]
                 stopping_tolerance=0.001,  # type: float
                 balance_classes=False,  # type: bool
                 class_sampling_factors=None,  # type: Optional[List[float]]
                 max_after_balance_size=5.0,  # type: float
                 max_runtime_secs=0.0,  # type: float
                 custom_metric_func=None,  # type: Optional[str]
                 auc_type="auto",  # type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]
                 algorithm="auto",  # type: Literal["auto", "deeplearning", "drf", "gbm", "glm", "xgboost"]
                 algorithm_params=None,  # type: Optional[dict]
                 protected_columns=None,  # type: Optional[List[str]]
                 total_information_threshold=-1.0,  # type: float
                 net_information_threshold=-1.0,  # type: float
                 relevance_index_threshold=-1.0,  # type: float
                 safety_index_threshold=-1.0,  # type: float
                 data_fraction=1.0,  # type: float
                 top_n_features=50,  # type: int
                 ):
        """
        :param model_id: Destination id for this model; auto-generated if not specified. Defaults to ``None``.
        :type model_id: Union[None, str, H2OEstimator], optional
        :param training_frame: Id of the training data frame. Defaults to ``None``.
        :type training_frame: Union[None, str, H2OFrame], optional
        :param validation_frame: Id of the validation data frame. Defaults to ``None``.
        :type validation_frame: Union[None, str, H2OFrame], optional
        :param seed: Seed for pseudo random number generator (if applicable). Defaults to ``-1``.
        :type seed: int
        :param keep_cross_validation_models: Whether to keep the cross-validation models. Defaults to ``True``.
        :type keep_cross_validation_models: bool
        :param keep_cross_validation_predictions: Whether to keep the predictions of the cross-validation models.
               Defaults to ``False``.
        :type keep_cross_validation_predictions: bool
        :param keep_cross_validation_fold_assignment: Whether to keep the cross-validation fold assignment.
               Defaults to ``False``.
        :type keep_cross_validation_fold_assignment: bool
        :param nfolds: Number of folds for K-fold cross-validation (0 to disable or >= 2). Defaults to ``0``.
        :type nfolds: int
        :param fold_assignment: Cross-validation fold assignment scheme, if fold_column is not specified. The
               'Stratified' option will stratify the folds based on the response variable, for classification
               problems. Defaults to ``"auto"``.
        :type fold_assignment: Literal["auto", "random", "modulo", "stratified"]
        :param fold_column: Column with cross-validation fold index assignment per observation. Defaults to ``None``.
        :type fold_column: str, optional
        :param response_column: Response variable column. Defaults to ``None``.
        :type response_column: str, optional
        :param ignored_columns: Names of columns to ignore for training. Defaults to ``None``.
        :type ignored_columns: List[str], optional
        :param ignore_const_cols: Ignore constant columns. Defaults to ``True``.
        :type ignore_const_cols: bool
        :param score_each_iteration: Whether to score during each iteration of model training. Defaults to ``False``.
        :type score_each_iteration: bool
        :param offset_column: Offset column. This will be added to the combination of columns before applying the
               link function. Defaults to ``None``.
        :type offset_column: str, optional
        :param weights_column: Column with observation weights. Giving some observation a weight of zero is
               equivalent to excluding it from the dataset; giving an observation a relative weight of 2 is
               equivalent to repeating that row twice. Negative weights are not allowed. Note: Weights are per-row
               observation weights and do not increase the size of the data frame. This is typically the number of
               times a row is repeated, but non-integer values are supported as well. During training, rows with
               higher weights matter more, due to the larger loss function pre-factor. If you set weight = 0 for a
               row, the returned prediction frame at that row is zero and this is incorrect. To get an accurate
               prediction, remove all rows with weight == 0. Defaults to ``None``.
        :type weights_column: str, optional
        :param standardize: Standardize numeric columns to have zero mean and unit variance. Defaults to ``False``.
        :type standardize: bool
        :param distribution: Distribution function. Defaults to ``"auto"``.
        :type distribution: Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie",
               "laplace", "quantile", "huber"]
        :param plug_values: Plug Values (a single row frame containing values that will be used to impute missing
               values of the training/validation frame, use in conjunction with missing_values_handling = PlugValues).
               Defaults to ``None``.
        :type plug_values: Union[None, str, H2OFrame], optional
        :param max_iterations: Maximum number of iterations. Defaults to ``0``.
        :type max_iterations: int
        :param stopping_rounds: Early stopping based on convergence of stopping_metric. Stop if simple moving average
               of length k of the stopping_metric does not improve for k:=stopping_rounds scoring events
               (0 to disable). Defaults to ``0``.
        :type stopping_rounds: int
        :param stopping_metric: Metric to use for early stopping (AUTO: logloss for classification, deviance for
               regression and anomaly_score for Isolation Forest). Note that custom and custom_increasing can only be
               used in GBM and DRF with the Python client. Defaults to ``"auto"``.
        :type stopping_metric: Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr",
               "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"]
        :param stopping_tolerance: Relative tolerance for metric-based stopping criterion (stop if relative
               improvement is not at least this much). Defaults to ``0.001``.
        :type stopping_tolerance: float
        :param balance_classes: Balance training data class counts via over/under-sampling (for imbalanced data).
               Defaults to ``False``.
        :type balance_classes: bool
        :param class_sampling_factors: Desired over/under-sampling ratios per class (in lexicographic order). If not
               specified, sampling factors will be automatically computed to obtain class balance during training.
               Requires balance_classes. Defaults to ``None``.
        :type class_sampling_factors: List[float], optional
        :param max_after_balance_size: Maximum relative size of the training data after balancing class counts (can
               be less than 1.0). Requires balance_classes. Defaults to ``5.0``.
        :type max_after_balance_size: float
        :param max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
               Defaults to ``0.0``.
        :type max_runtime_secs: float
        :param custom_metric_func: Reference to custom evaluation function, format: `language:keyName=funcName`.
               Defaults to ``None``.
        :type custom_metric_func: str, optional
        :param auc_type: Set default multinomial AUC type. Defaults to ``"auto"``.
        :type auc_type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]
        :param algorithm: Type of machine learning algorithm used to build the infogram. Options include 'AUTO'
               (gbm), 'deeplearning' (Deep Learning with default parameters), 'drf' (Random Forest with default
               parameters), 'gbm' (GBM with default parameters), 'glm' (GLM with default parameters), or 'xgboost'
               (if available, XGBoost with default parameters). Defaults to ``"auto"``.
        :type algorithm: Literal["auto", "deeplearning", "drf", "gbm", "glm", "xgboost"]
        :param algorithm_params: Customized parameters for the machine learning algorithm specified in the algorithm
               parameter. Defaults to ``None``.
        :type algorithm_params: dict, optional
        :param protected_columns: Columns that contain features that are sensitive and need to be protected (legally,
               or otherwise), if applicable. These features (e.g. race, gender, etc) should not drive the prediction
               of the response. Defaults to ``None``.
        :type protected_columns: List[str], optional
        :param total_information_threshold: A number between 0 and 1 representing a threshold for total information,
               defaulting to 0.1. For a specific feature, if the total information is higher than this threshold, and
               the corresponding net information is also higher than the threshold ``net_information_threshold``,
               that feature will be considered admissible. The total information is the x-axis of the Core Infogram.
               Default is -1 which gets set to 0.1. Defaults to ``-1.0``.
        :type total_information_threshold: float
        :param net_information_threshold: A number between 0 and 1 representing a threshold for net information,
               defaulting to 0.1. For a specific feature, if the net information is higher than this threshold, and
               the corresponding total information is also higher than the total_information_threshold, that feature
               will be considered admissible. The net information is the y-axis of the Core Infogram. Default is -1
               which gets set to 0.1. Defaults to ``-1.0``.
        :type net_information_threshold: float
        :param relevance_index_threshold: A number between 0 and 1 representing a threshold for the relevance index,
               defaulting to 0.1. This is only used when ``protected_columns`` is set by the user. For a specific
               feature, if the relevance index value is higher than this threshold, and the corresponding safety
               index is also higher than the ``safety_index_threshold``, that feature will be considered admissible.
               The relevance index is the x-axis of the Fair Infogram. Default is -1 which gets set to 0.1.
               Defaults to ``-1.0``.
        :type relevance_index_threshold: float
        :param safety_index_threshold: A number between 0 and 1 representing a threshold for the safety index,
               defaulting to 0.1. This is only used when protected_columns is set by the user. For a specific
               feature, if the safety index value is higher than this threshold, and the corresponding relevance
               index is also higher than the relevance_index_threshold, that feature will be considered admissible.
               The safety index is the y-axis of the Fair Infogram. Default is -1 which gets set to 0.1.
               Defaults to ``-1.0``.
        :type safety_index_threshold: float
        :param data_fraction: The fraction of training frame to use to build the infogram model. Defaults to 1.0, and
               any value greater than 0 and less than or equal to 1.0 is acceptable. Defaults to ``1.0``.
        :type data_fraction: float
        :param top_n_features: An integer specifying the number of columns to evaluate in the infogram. The columns
               are ranked by variable importance, and the top N are evaluated. Defaults to 50. Defaults to ``50``.
        :type top_n_features: int
        """
        super(H2OInfogram, self).__init__()
        self._parms = {}
        self._id = self._parms['model_id'] = model_id
        self.training_frame = training_frame
        self.validation_frame = validation_frame
        self.seed = seed
        self.keep_cross_validation_models = keep_cross_validation_models
        self.keep_cross_validation_predictions = keep_cross_validation_predictions
        self.keep_cross_validation_fold_assignment = keep_cross_validation_fold_assignment
        self.nfolds = nfolds
        self.fold_assignment = fold_assignment
        self.fold_column = fold_column
        self.response_column = response_column
        self.ignored_columns = ignored_columns
        self.ignore_const_cols = ignore_const_cols
        self.score_each_iteration = score_each_iteration
        self.offset_column = offset_column
        self.weights_column = weights_column
        self.standardize = standardize
        self.distribution = distribution
        self.plug_values = plug_values
        self.max_iterations = max_iterations
        self.stopping_rounds = stopping_rounds
        self.stopping_metric = stopping_metric
        self.stopping_tolerance = stopping_tolerance
        self.balance_classes = balance_classes
        self.class_sampling_factors = class_sampling_factors
        self.max_after_balance_size = max_after_balance_size
        self.max_runtime_secs = max_runtime_secs
        self.custom_metric_func = custom_metric_func
        self.auc_type = auc_type
        self.algorithm = algorithm
        self.algorithm_params = algorithm_params
        self.protected_columns = protected_columns
        self.total_information_threshold = total_information_threshold
        self.net_information_threshold = net_information_threshold
        self.relevance_index_threshold = relevance_index_threshold
        self.safety_index_threshold = safety_index_threshold
        self.data_fraction = data_fraction
        self.top_n_features = top_n_features
        self._parms["_rest_version"] = 3

    @property
    def training_frame(self):
        """
        Id of the training data frame.

        Type: ``Union[None, str, H2OFrame]``.
        """
        return self._parms.get("training_frame")

    @training_frame.setter
    def training_frame(self, training_frame):
        self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame')

    @property
    def validation_frame(self):
        """
        Id of the validation data frame.

        Type: ``Union[None, str, H2OFrame]``.
        """
        return self._parms.get("validation_frame")

    @validation_frame.setter
    def validation_frame(self, validation_frame):
        self._parms["validation_frame"] = H2OFrame._validate(validation_frame, 'validation_frame')

    @property
    def seed(self):
        """
        Seed for pseudo random number generator (if applicable).

        Type: ``int``, defaults to ``-1``.
        """
        return self._parms.get("seed")

    @seed.setter
    def seed(self, seed):
        assert_is_type(seed, None, int)
        self._parms["seed"] = seed

    @property
    def keep_cross_validation_models(self):
        """
        Whether to keep the cross-validation models.

        Type: ``bool``, defaults to ``True``.
        """
        return self._parms.get("keep_cross_validation_models")

    @keep_cross_validation_models.setter
    def keep_cross_validation_models(self, keep_cross_validation_models):
        assert_is_type(keep_cross_validation_models, None, bool)
        self._parms["keep_cross_validation_models"] = keep_cross_validation_models

    @property
    def keep_cross_validation_predictions(self):
        """
        Whether to keep the predictions of the cross-validation models.

        Type: ``bool``, defaults to ``False``.
""" return self._parms.get("keep_cross_validation_predictions") @keep_cross_validation_predictions.setter def keep_cross_validation_predictions(self, keep_cross_validation_predictions): assert_is_type(keep_cross_validation_predictions, None, bool) self._parms["keep_cross_validation_predictions"] = keep_cross_validation_predictions @property def keep_cross_validation_fold_assignment(self): """ Whether to keep the cross-validation fold assignment. Type: ``bool``, defaults to ``False``. """ return self._parms.get("keep_cross_validation_fold_assignment") @keep_cross_validation_fold_assignment.setter def keep_cross_validation_fold_assignment(self, keep_cross_validation_fold_assignment): assert_is_type(keep_cross_validation_fold_assignment, None, bool) self._parms["keep_cross_validation_fold_assignment"] = keep_cross_validation_fold_assignment @property def nfolds(self): """ Number of folds for K-fold cross-validation (0 to disable or >= 2). Type: ``int``, defaults to ``0``. """ return self._parms.get("nfolds") @nfolds.setter def nfolds(self, nfolds): assert_is_type(nfolds, None, int) self._parms["nfolds"] = nfolds @property def fold_assignment(self): """ Cross-validation fold assignment scheme, if fold_column is not specified. The 'Stratified' option will stratify the folds based on the response variable, for classification problems. Type: ``Literal["auto", "random", "modulo", "stratified"]``, defaults to ``"auto"``. """ return self._parms.get("fold_assignment") @fold_assignment.setter def fold_assignment(self, fold_assignment): assert_is_type(fold_assignment, None, Enum("auto", "random", "modulo", "stratified")) self._parms["fold_assignment"] = fold_assignment @property def fold_column(self): """ Column with cross-validation fold index assignment per observation. Type: ``str``. """ return self._parms.get("fold_column") @fold_column.setter def fold_column(self, fold_column): assert_is_type(fold_column, None, str) self._parms["fold_column"] = fold_column @property def response_column(self): """ Response variable column. Type: ``str``. """ return self._parms.get("response_column") @response_column.setter def response_column(self, response_column): assert_is_type(response_column, None, str) self._parms["response_column"] = response_column @property def ignored_columns(self): """ Names of columns to ignore for training. Type: ``List[str]``. """ return self._parms.get("ignored_columns") @ignored_columns.setter def ignored_columns(self, ignored_columns): assert_is_type(ignored_columns, None, [str]) self._parms["ignored_columns"] = ignored_columns @property def ignore_const_cols(self): """ Ignore constant columns. Type: ``bool``, defaults to ``True``. """ return self._parms.get("ignore_const_cols") @ignore_const_cols.setter def ignore_const_cols(self, ignore_const_cols): assert_is_type(ignore_const_cols, None, bool) self._parms["ignore_const_cols"] = ignore_const_cols @property def score_each_iteration(self): """ Whether to score during each iteration of model training. Type: ``bool``, defaults to ``False``. """ return self._parms.get("score_each_iteration") @score_each_iteration.setter def score_each_iteration(self, score_each_iteration): assert_is_type(score_each_iteration, None, bool) self._parms["score_each_iteration"] = score_each_iteration @property def offset_column(self): """ Offset column. This will be added to the combination of columns before applying the link function. Type: ``str``. 
""" return self._parms.get("offset_column") @offset_column.setter def offset_column(self, offset_column): assert_is_type(offset_column, None, str) self._parms["offset_column"] = offset_column @property def weights_column(self): """ Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data frame. This is typically the number of times a row is repeated, but non-integer values are supported as well. During training, rows with higher weights matter more, due to the larger loss function pre-factor. If you set weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get an accurate prediction, remove all rows with weight == 0. Type: ``str``. """ return self._parms.get("weights_column") @weights_column.setter def weights_column(self, weights_column): assert_is_type(weights_column, None, str) self._parms["weights_column"] = weights_column @property def standardize(self): """ Standardize numeric columns to have zero mean and unit variance. Type: ``bool``, defaults to ``False``. """ return self._parms.get("standardize") @standardize.setter def standardize(self, standardize): assert_is_type(standardize, None, bool) self._parms["standardize"] = standardize @property def distribution(self): """ Distribution function Type: ``Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber"]``, defaults to ``"auto"``. """ return self._parms.get("distribution") @distribution.setter def distribution(self, distribution): assert_is_type(distribution, None, Enum("auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber")) self._parms["distribution"] = distribution @property def plug_values(self): """ Plug Values (a single row frame containing values that will be used to impute missing values of the training/validation frame, use with conjunction missing_values_handling = PlugValues). Type: ``Union[None, str, H2OFrame]``. """ return self._parms.get("plug_values") @plug_values.setter def plug_values(self, plug_values): self._parms["plug_values"] = H2OFrame._validate(plug_values, 'plug_values') @property def max_iterations(self): """ Maximum number of iterations. Type: ``int``, defaults to ``0``. """ return self._parms.get("max_iterations") @max_iterations.setter def max_iterations(self, max_iterations): assert_is_type(max_iterations, None, int) self._parms["max_iterations"] = max_iterations @property def stopping_rounds(self): """ Early stopping based on convergence of stopping_metric. Stop if simple moving average of length k of the stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable) Type: ``int``, defaults to ``0``. """ return self._parms.get("stopping_rounds") @stopping_rounds.setter def stopping_rounds(self, stopping_rounds): assert_is_type(stopping_rounds, None, int) self._parms["stopping_rounds"] = stopping_rounds @property def stopping_metric(self): """ Metric to use for early stopping (AUTO: logloss for classification, deviance for regression and anomaly_score for Isolation Forest). Note that custom and custom_increasing can only be used in GBM and DRF with the Python client. 
Type: ``Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"]``, defaults to ``"auto"``. """ return self._parms.get("stopping_metric") @stopping_metric.setter def stopping_metric(self, stopping_metric): assert_is_type(stopping_metric, None, Enum("auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing")) self._parms["stopping_metric"] = stopping_metric @property def stopping_tolerance(self): """ Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this much) Type: ``float``, defaults to ``0.001``. """ return self._parms.get("stopping_tolerance") @stopping_tolerance.setter def stopping_tolerance(self, stopping_tolerance): assert_is_type(stopping_tolerance, None, numeric) self._parms["stopping_tolerance"] = stopping_tolerance @property def balance_classes(self): """ Balance training data class counts via over/under-sampling (for imbalanced data). Type: ``bool``, defaults to ``False``. """ return self._parms.get("balance_classes") @balance_classes.setter def balance_classes(self, balance_classes): assert_is_type(balance_classes, None, bool) self._parms["balance_classes"] = balance_classes @property def class_sampling_factors(self): """ Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling factors will be automatically computed to obtain class balance during training. Requires balance_classes. Type: ``List[float]``. """ return self._parms.get("class_sampling_factors") @class_sampling_factors.setter def class_sampling_factors(self, class_sampling_factors): assert_is_type(class_sampling_factors, None, [float]) self._parms["class_sampling_factors"] = class_sampling_factors @property def max_after_balance_size(self): """ Maximum relative size of the training data after balancing class counts (can be less than 1.0). Requires balance_classes. Type: ``float``, defaults to ``5.0``. """ return self._parms.get("max_after_balance_size") @max_after_balance_size.setter def max_after_balance_size(self, max_after_balance_size): assert_is_type(max_after_balance_size, None, float) self._parms["max_after_balance_size"] = max_after_balance_size @property def max_runtime_secs(self): """ Maximum allowed runtime in seconds for model training. Use 0 to disable. Type: ``float``, defaults to ``0.0``. """ return self._parms.get("max_runtime_secs") @max_runtime_secs.setter def max_runtime_secs(self, max_runtime_secs): assert_is_type(max_runtime_secs, None, numeric) self._parms["max_runtime_secs"] = max_runtime_secs @property def custom_metric_func(self): """ Reference to custom evaluation function, format: `language:keyName=funcName` Type: ``str``. """ return self._parms.get("custom_metric_func") @custom_metric_func.setter def custom_metric_func(self, custom_metric_func): assert_is_type(custom_metric_func, None, str) self._parms["custom_metric_func"] = custom_metric_func @property def auc_type(self): """ Set default multinomial AUC type. Type: ``Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]``, defaults to ``"auto"``. 
""" return self._parms.get("auc_type") @auc_type.setter def auc_type(self, auc_type): assert_is_type(auc_type, None, Enum("auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo")) self._parms["auc_type"] = auc_type @property def algorithm(self): """ Type of machine learning algorithm used to build the infogram. Options include 'AUTO' (gbm), 'deeplearning' (Deep Learning with default parameters), 'drf' (Random Forest with default parameters), 'gbm' (GBM with default parameters), 'glm' (GLM with default parameters), or 'xgboost' (if available, XGBoost with default parameters). Type: ``Literal["auto", "deeplearning", "drf", "gbm", "glm", "xgboost"]``, defaults to ``"auto"``. """ return self._parms.get("algorithm") @algorithm.setter def algorithm(self, algorithm): assert_is_type(algorithm, None, Enum("auto", "deeplearning", "drf", "gbm", "glm", "xgboost")) self._parms["algorithm"] = algorithm @property def algorithm_params(self): """ Customized parameters for the machine learning algorithm specified in the algorithm parameter. Type: ``dict``. """ if self._parms.get("algorithm_params") != None: algorithm_params_dict = ast.literal_eval(self._parms.get("algorithm_params")) for k in algorithm_params_dict: if len(algorithm_params_dict[k]) == 1: #single parameter algorithm_params_dict[k] = algorithm_params_dict[k][0] return algorithm_params_dict else: return self._parms.get("algorithm_params") @algorithm_params.setter def algorithm_params(self, algorithm_params): assert_is_type(algorithm_params, None, dict) if algorithm_params is not None and algorithm_params != "": for k in algorithm_params: if ("[" and "]") not in str(algorithm_params[k]): algorithm_params[k] = [algorithm_params[k]] self._parms["algorithm_params"] = str(json.dumps(algorithm_params)) else: self._parms["algorithm_params"] = None @property def protected_columns(self): """ Columns that contain features that are sensitive and need to be protected (legally, or otherwise), if applicable. These features (e.g. race, gender, etc) should not drive the prediction of the response. Type: ``List[str]``. """ return self._parms.get("protected_columns") @protected_columns.setter def protected_columns(self, protected_columns): assert_is_type(protected_columns, None, [str]) self._parms["protected_columns"] = protected_columns @property def total_information_threshold(self): """ A number between 0 and 1 representing a threshold for total information, defaulting to 0.1. For a specific feature, if the total information is higher than this threshold, and the corresponding net information is also higher than the threshold ``net_information_threshold``, that feature will be considered admissible. The total information is the x-axis of the Core Infogram. Default is -1 which gets set to 0.1. Type: ``float``, defaults to ``-1.0``. """ return self._parms.get("total_information_threshold") @total_information_threshold.setter def total_information_threshold(self, total_information_threshold): if total_information_threshold <= -1: # not set if self._parms["protected_columns"] is None: self._parms["total_information_threshold"] = 0.1 else: if self._parms["protected_columns"] is not None: # fair infogram warnings.warn("Should not set total_information_threshold for fair infogram runs. Set relevance_index_threshold instead. 
Using default of 0.1 if not set", RuntimeWarning) else: self._parms["total_information_threshold"] = total_information_threshold @property def net_information_threshold(self): """ A number between 0 and 1 representing a threshold for net information, defaulting to 0.1. For a specific feature, if the net information is higher than this threshold, and the corresponding total information is also higher than the total_information_threshold, that feature will be considered admissible. The net information is the y-axis of the Core Infogram. Default is -1 which gets set to 0.1. Type: ``float``, defaults to ``-1.0``. """ return self._parms.get("net_information_threshold") @net_information_threshold.setter def net_information_threshold(self, net_information_threshold): if net_information_threshold <= -1: # not set if self._parms["protected_columns"] is None: self._parms["net_information_threshold"]=0.1 else: # set if self._parms["protected_columns"] is not None: # fair infogram warnings.warn("Should not set net_information_threshold for fair infogram runs. Set safety_index_threshold instead. Using default of 0.1 if not set", RuntimeWarning) else: self._parms["net_information_threshold"]=net_information_threshold @property def relevance_index_threshold(self): """ A number between 0 and 1 representing a threshold for the relevance index, defaulting to 0.1. This is only used when ``protected_columns`` is set by the user. For a specific feature, if the relevance index value is higher than this threshold, and the corresponding safety index is also higher than the safety_index_threshold``, that feature will be considered admissible. The relevance index is the x-axis of the Fair Infogram. Default is -1 which gets set to 0.1. Type: ``float``, defaults to ``-1.0``. """ return self._parms.get("relevance_index_threshold") @relevance_index_threshold.setter def relevance_index_threshold(self, relevance_index_threshold): if relevance_index_threshold <= -1: # not set if self._parms["protected_columns"] is not None: # fair infogram self._parms["relevance_index_threshold"]=0.1 else: # it is set if self._parms["protected_columns"] is not None: # fair infogram self._parms["relevance_index_threshold"] = relevance_index_threshold else: # core infogram should not have been set warnings.warn("Should not set relevance_index_threshold for core infogram runs. Set total_information_threshold instead. Using default of 0.1 if not set", RuntimeWarning) @property def safety_index_threshold(self): """ A number between 0 and 1 representing a threshold for the safety index, defaulting to 0.1. This is only used when protected_columns is set by the user. For a specific feature, if the safety index value is higher than this threshold, and the corresponding relevance index is also higher than the relevance_index_threshold, that feature will be considered admissible. The safety index is the y-axis of the Fair Infogram. Default is -1 which gets set to 0.1. Type: ``float``, defaults to ``-1.0``. """ return self._parms.get("safety_index_threshold") @safety_index_threshold.setter def safety_index_threshold(self, safety_index_threshold): if safety_index_threshold <= -1: # not set if self._parms["protected_columns"] is not None: self._parms["safety_index_threshold"]=0.1 else: # it is set if self._parms["protected_columns"] is not None: # fair infogram self._parms["safety_index_threshold"] = safety_index_threshold else: # core infogram should not have been set warnings.warn("Should not set safety_index_threshold for core infogram runs. 
Set net_information_threshold instead. Using default of 0.1 if not set", RuntimeWarning) @property def data_fraction(self): """ The fraction of training frame to use to build the infogram model. Defaults to 1.0, and any value greater than 0 and less than or equal to 1.0 is acceptable. Type: ``float``, defaults to ``1.0``. """ return self._parms.get("data_fraction") @data_fraction.setter def data_fraction(self, data_fraction): assert_is_type(data_fraction, None, numeric) self._parms["data_fraction"] = data_fraction @property def top_n_features(self): """ An integer specifying the number of columns to evaluate in the infogram. The columns are ranked by variable importance, and the top N are evaluated. Defaults to 50. Type: ``int``, defaults to ``50``. """ return self._parms.get("top_n_features") @top_n_features.setter def top_n_features(self, top_n_features): assert_is_type(top_n_features, None, int) self._parms["top_n_features"] = top_n_features def _extract_x_from_model(self): """ extract admissible features from an Infogram model. :return: List of predictors that are considered admissible """ features = self._model_json.get('output', {}).get('admissible_features') if features is None: raise ValueError("model %s doesn't have any admissible features" % self.key) return set(features)
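    # The threshold setters above behave differently for core and fair infograms. An illustrative construction
    # sketch (the column names "race" and "gender" are placeholders, not part of this module):
    #
    #     >>> core_ig = H2OInfogram(total_information_threshold=0.2,  # core infogram: x-axis threshold
    #     ...                       net_information_threshold=0.2)    # core infogram: y-axis threshold
    #     >>> fair_ig = H2OInfogram(protected_columns=["race", "gender"],
    #     ...                       relevance_index_threshold=0.2,    # fair infogram: x-axis threshold
    #     ...                       safety_index_threshold=0.2)       # fair infogram: y-axis threshold
    #
    # Setting total/net information thresholds on a fair infogram (or relevance/safety index thresholds on a core
    # infogram) only raises the RuntimeWarning above; the value is ignored and the 0.1 default applies.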
    def plot(self, train=True, valid=False, xval=False, figsize=(10, 10), title="Infogram", legend_on=False,
             server=False):
        """
        Plot the infogram. By default, it will plot the infogram calculated from the training dataset. Note that the
        frame rel_cmi_frame contains the following columns:

        - 0: predictor names
        - 1: admissible
        - 2: admissible index
        - 3: relevance-index or total information
        - 4: safety-index or net information, normalized from 0 to 1
        - 5: safety-index or net information not normalized

        :param train: True if infogram is generated from the training dataset
        :param valid: True if infogram is generated from the validation dataset
        :param xval: True if infogram is generated from the cross-validation holdout dataset
        :param figsize: size of the infogram plot
        :param title: string to denote the title of the plot
        :param legend_on: legend text is included if True
        :param server: if True, the plot is built but not displayed (useful for server environments)
        :return: the infogram plot wrapped in a decorated plot result
        """
        plt = get_matplotlib_pyplot(server, raise_if_not_available=True)
        polycoll = get_polycollection(server, raise_if_not_available=True)
        if not can_use_numpy():
            raise ImportError("numpy is required for Infogram.")
        import numpy as np

        if train:
            rel_cmi_frame = self.get_admissible_score_frame()
            if rel_cmi_frame is None:
                raise H2OValueError("Cannot locate the H2OFrame containing the infogram data from training dataset.")
        if valid:
            rel_cmi_frame_valid = self.get_admissible_score_frame(valid=True)
            if rel_cmi_frame_valid is None:
                raise H2OValueError("Cannot locate the H2OFrame containing the infogram data from validation dataset.")
        if xval:
            rel_cmi_frame_xval = self.get_admissible_score_frame(xval=True)
            if rel_cmi_frame_xval is None:
                raise H2OValueError("Cannot locate the H2OFrame containing the infogram data from xval holdout dataset.")

        rel_cmi_frame_names = rel_cmi_frame.names
        x_label = rel_cmi_frame_names[3]
        y_label = rel_cmi_frame_names[4]
        ig_x_column = 3
        ig_y_column = 4
        index_of_admissible = 1
        features_column = 0
        if self.actual_params['protected_columns'] is None:
            x_thresh = self.actual_params['total_information_threshold']
            y_thresh = self.actual_params['net_information_threshold']
        else:
            x_thresh = self.actual_params["relevance_index_threshold"]
            y_thresh = self.actual_params["safety_index_threshold"]
        xmax = 1.1
        ymax = 1.1

        X = np.array(rel_cmi_frame[ig_x_column].as_data_frame(header=False, use_pandas=False)).astype(float).reshape((-1,))
        Y = np.array(rel_cmi_frame[ig_y_column].as_data_frame(header=False, use_pandas=False)).astype(float).reshape((-1,))
        features = np.array(rel_cmi_frame[features_column].as_data_frame(header=False, use_pandas=False)).reshape((-1,))
        admissible = np.array(rel_cmi_frame[index_of_admissible].as_data_frame(header=False, use_pandas=False)).astype(float).reshape((-1,))
        mask = admissible > 0
        if valid:
            X_valid = np.array(rel_cmi_frame_valid[ig_x_column].as_data_frame(header=False, use_pandas=False)).astype(float).reshape((-1,))
            Y_valid = np.array(rel_cmi_frame_valid[ig_y_column].as_data_frame(header=False, use_pandas=False)).astype(float).reshape((-1,))
            features_valid = np.array(rel_cmi_frame_valid[features_column].as_data_frame(header=False, use_pandas=False)).reshape((-1,))
            admissible_valid = np.array(rel_cmi_frame_valid[index_of_admissible].as_data_frame(header=False, use_pandas=False)).astype(float).reshape((-1,))
            mask_valid = admissible_valid > 0
        if xval:
            X_xval = np.array(rel_cmi_frame_xval[ig_x_column].as_data_frame(header=False, use_pandas=False)).astype(float).reshape((-1,))
            Y_xval = np.array(rel_cmi_frame_xval[ig_y_column].as_data_frame(header=False, use_pandas=False)).astype(float).reshape((-1,))
            features_xval = np.array(rel_cmi_frame_xval[features_column].as_data_frame(header=False, use_pandas=False)).reshape((-1,))
            admissible_xval = np.array(rel_cmi_frame_xval[index_of_admissible].as_data_frame(header=False, use_pandas=False)).astype(float).reshape((-1,))
            mask_xval = admissible_xval > 0

        plt.figure(figsize=figsize)
        plt.grid(True)
        plt.scatter(X, Y, zorder=10, c=np.where(mask, "black", "gray"), label="training data")
        if valid:
            plt.scatter(X_valid, Y_valid, zorder=10, marker=",", c=np.where(mask_valid, "black", "gray"),
                        label="validation data")
        if xval:
            plt.scatter(X_xval, Y_xval, zorder=10, marker="v", c=np.where(mask_xval, "black", "gray"),
                        label="xval holdout data")
        if legend_on:
            plt.legend(loc=2, fancybox=True, framealpha=0.5)
        plt.hlines(y_thresh, xmin=x_thresh, xmax=xmax, colors="red", linestyle="dashed")
        plt.vlines(x_thresh, ymin=y_thresh, ymax=ymax, colors="red", linestyle="dashed")
        plt.gca().add_collection(polycoll(verts=[[(0, 0), (0, ymax), (x_thresh, ymax), (x_thresh, y_thresh),
                                                  (xmax, y_thresh), (xmax, 0)]],
                                          color="#CC663E", alpha=0.1, zorder=5))
        for i in mask.nonzero()[0]:
            plt.annotate(features[i], (X[i], Y[i]), xytext=(0, -10), textcoords="offset points",
                         horizontalalignment='center', verticalalignment='top', color="blue")
        if valid:
            for i in mask_valid.nonzero()[0]:
                plt.annotate(features_valid[i], (X_valid[i], Y_valid[i]), xytext=(0, -10), textcoords="offset points",
                             horizontalalignment='center', verticalalignment='top', color="magenta")
        if xval:
            for i in mask_xval.nonzero()[0]:
                plt.annotate(features_xval[i], (X_xval[i], Y_xval[i]), xytext=(0, -10), textcoords="offset points",
                             horizontalalignment='center', verticalalignment='top', color="green")
        plt.xlim(0, 1.05)
        plt.ylim(0, 1.05)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.title(title)
        fig = plt.gcf()
        if not server:
            plt.show()
        return decorate_plot_result(figure=fig)
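    # A minimal plotting sketch (assumes a connected H2O cluster; `df` is an H2OFrame with a categorical response
    # "y", and "race"/"gender" are placeholder protected column names):
    #
    #     >>> ig = H2OInfogram(protected_columns=["race", "gender"])
    #     >>> ig.train(y="y", training_frame=df)
    #     >>> ig.plot(title="Fair infogram", legend_on=True)   # displays the plot
    #     >>> result = ig.plot(server=True)                    # builds and returns the figure without displaying it
    #
    # With protected_columns the axes are relevance index vs. safety index; without them the same call draws the
    # core infogram (total vs. net information).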
    def get_admissible_score_frame(self, valid=False, xval=False):
        """
        Retrieve the admissible score frame, which includes the relevance and CMI information, as an H2OFrame; by
        default the frame for the training dataset is returned.

        :param valid: return infogram info on the validation dataset if True
        :param xval: return infogram info on the cross-validation holdouts if True
        :return: H2OFrame
        """
        keyString = self._model_json["output"]["admissible_score_key"]
        if valid:
            keyString = self._model_json["output"]["admissible_score_key_valid"]
        elif xval:
            keyString = self._model_json["output"]["admissible_score_key_xval"]
        if keyString is None:
            return None
        else:
            return h2o.get_frame(keyString['name'])
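    # Illustrative use of the admissible score frame (assumes `ig` is a trained H2OInfogram; the column positions
    # follow the layout documented in plot() above):
    #
    #     >>> score = ig.get_admissible_score_frame()                   # scores on the training data
    #     >>> score_valid = ig.get_admissible_score_frame(valid=True)   # None unless a validation frame was supplied
    #     >>> admissible_only = score[score[1] == 1, :]                 # column 1 is the admissible flag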
    def get_admissible_features(self):
        """
        :return: a list of predictors that are considered admissible
        """
        if self._model_json["output"]["admissible_features"] is None:
            return None
        else:
            return self._model_json["output"]["admissible_features"]
    def get_admissible_relevance(self):
        """
        :return: a list of relevance (variable importance) values for admissible attributes
        """
        if self._model_json["output"]["admissible_relevance"] is None:
            return None
        else:
            return self._model_json["output"]["admissible_relevance"]
    def get_admissible_cmi(self):
        """
        :return: a list of the normalized CMI of admissible attributes
        """
        if self._model_json["output"]["admissible_cmi"] is None:
            return None
        else:
            return self._model_json["output"]["admissible_cmi"]
    def get_admissible_cmi_raw(self):
        """
        :return: a list of the raw CMI of admissible attributes
        """
        if self._model_json["output"]["admissible_cmi_raw"] is None:
            return None
        else:
            return self._model_json["output"]["admissible_cmi_raw"]
    def get_all_predictor_relevance(self):
        """
        Get the relevance of all predictors.

        :return: a tuple of two lists: the first contains the predictor names and the second their relevance
        """
        if self._model_json["output"]["all_predictor_names"] is None:
            return None
        else:
            return self._model_json["output"]["all_predictor_names"], self._model_json["output"]["relevance"]
    def get_all_predictor_cmi(self):
        """
        Get the normalized CMI of all predictors.

        :return: a tuple of two lists: the first contains the predictor names and the second their normalized CMI
        """
        if self._model_json["output"]["all_predictor_names"] is None:
            return None
        else:
            return self._model_json["output"]["all_predictor_names"], self._model_json["output"]["cmi"]
    def get_all_predictor_cmi_raw(self):
        """
        Get the raw CMI of all predictors.

        :return: a tuple of two lists: the first contains the predictor names and the second their raw CMI
        """
        if self._model_json["output"]["all_predictor_names"] is None:
            return None
        else:
            return self._model_json["output"]["all_predictor_names"], self._model_json["output"]["cmi_raw"]
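    # Illustrative accessor usage (assumes `ig` is a trained H2OInfogram):
    #
    #     >>> admissible = ig.get_admissible_features()      # names of admissible predictors, or None
    #     >>> names, rel = ig.get_all_predictor_relevance()  # relevance for every evaluated predictor
    #     >>> names, cmi = ig.get_all_predictor_cmi()        # normalized CMI for every evaluated predictor
    #     >>> ranking = dict(zip(names, rel))                # map predictor name -> relevance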
    # Override train method to support infogram needs
    def train(self, x=None, y=None, training_frame=None, verbose=False, **kwargs):
        sup = super(self.__class__, self)

        def extend_parms(parms):  # add parameter checks specific to infogram
            if parms["data_fraction"] is not None:
                assert_is_type(parms["data_fraction"], numeric)
                assert 0 < parms["data_fraction"] <= 1, "data_fraction should exceed 0 and be less than or equal to 1."

        parms = sup._make_parms(x, y, training_frame, extend_parms_fn=extend_parms, **kwargs)
        sup._train(parms, verbose=verbose)
        # can probably get rid of model attributes that Erin does not want here
        return self
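    # A minimal training sketch (assumes a running H2O cluster; `df` is an H2OFrame with a categorical response
    # "y"; the algorithm_params values are illustrative, not recommended settings):
    #
    #     >>> ig = H2OInfogram(algorithm="gbm",
    #     ...                  algorithm_params={"ntrees": 3},  # forwarded to the underlying GBM
    #     ...                  data_fraction=0.8)               # build the infogram on 80% of the rows
    #     >>> ig.train(y="y", training_frame=df)
    #     >>> ig.get_admissible_features()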
    @staticmethod
    def _train_and_get_models(model_class, x, y, train, **kwargs):
        from h2o.automl import H2OAutoML
        from h2o.grid import H2OGridSearch
        model = model_class(**kwargs)
        model.train(x, y, train)
        if model_class is H2OAutoML:
            return [h2o.get_model(m[0]) for m in model.leaderboard["model_id"].as_data_frame(False, False)]
        elif model_class is H2OGridSearch:
            return [h2o.get_model(m) for m in model.model_ids]
        else:
            return [model]
    def train_subset_models(self, model_class, y, training_frame, test_frame, protected_columns=None, reference=None,
                            favorable_class=None, feature_selection_metrics=None, metric="euclidean", **kwargs):
        """
        Train models using different feature subsets selected by the infogram.

        :param model_class: H2O Estimator class, H2OAutoML, or H2OGridSearch
        :param y: response column
        :param training_frame: training frame
        :param test_frame: test frame
        :param protected_columns: List of categorical columns that contain sensitive information such as race,
            gender, age etc.
        :param reference: List of values corresponding to a reference for each protected column. If set to ``None``,
            it will use the biggest group as the reference.
        :param favorable_class: Positive/favorable outcome class of the response.
        :param feature_selection_metrics: column names from the infogram's admissible score frame that are used for
            the feature subset selection. Defaults to ``safety_index`` for the fair infogram and ``admissible_index``
            for the core infogram.
        :param metric: metric to combine information from the columns specified in feature_selection_metrics. Can be
            one of "euclidean", "manhattan", "maximum", or a function that takes the admissible score frame and
            feature_selection_metrics and produces a single column.
        :param kwargs: Arguments passed to the constructor of the model_class
        :return: H2OFrame

        :examples:
        >>> from h2o.estimators import H2OGradientBoostingEstimator, H2OInfogram
        >>> data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/admissibleml_test/taiwan_credit_card_uci.csv")
        >>> x = ['LIMIT_BAL', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3',
        >>>      'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
        >>> y = "default payment next month"
        >>> protected_columns = ['SEX', 'EDUCATION']
        >>>
        >>> for c in [y] + protected_columns:
        >>>     data[c] = data[c].asfactor()
        >>>
        >>> train, test = data.split_frame([0.8])
        >>>
        >>> reference = ["1", "2"]  # university educated single man
        >>> favorable_class = "0"  # no default next month
        >>>
        >>> ig = H2OInfogram(protected_columns=protected_columns)
        >>> ig.train(x, y, training_frame=train)
        >>>
        >>> ig.train_subset_models(H2OGradientBoostingEstimator, y, train, test, protected_columns, reference, favorable_class)
        """
        from h2o import H2OFrame, make_leaderboard
        from h2o.explanation import disparate_analysis
        from h2o.utils.typechecks import assert_is_type
        assert hasattr(model_class, "train")
        assert_is_type(y, str)
        assert_is_type(training_frame, H2OFrame)

        score = self.get_admissible_score_frame()
        if feature_selection_metrics is None:
            if "safety_index" in score.columns:
                feature_selection_metrics = ["safety_index"]
            else:
                feature_selection_metrics = ["admissible_index"]

        for fs_col in feature_selection_metrics:
            if fs_col not in score.columns:
                raise ValueError("Column '{}' is not present in the admissible score frame.".format(fs_col))

        metrics = dict(
            euclidean=lambda fr, fs_metrics: (fr[:, fs_metrics] ** 2).sum(axis=1).sqrt(),
            manhattan=lambda fr, fs_metrics: fr[:, fs_metrics].abs().sum(axis=1),
            maximum=lambda fr, fs_metrics: fr[:, fs_metrics].apply(lambda row: row.max(), axis=1),
        )
        metric_fn = metric
        if not callable(metric) and metric.lower() not in metrics.keys():
            raise ValueError("Metric '{}' is not supported!".format(metric.lower()))
        if not callable(metric):
            metric_fn = metrics.get(metric.lower())

        if len(feature_selection_metrics) == 1:
            # sum(..., axis=1) behaves oddly for a single column (it collapses to a single number), so use the
            # column directly
            score["sort_metric"] = score[:, feature_selection_metrics]
        else:
            score["sort_metric"] = metric_fn(score, feature_selection_metrics)
        score = score.sort("sort_metric", False)
        cols = [x[0] for x in score["column"].as_data_frame(False, False)]
        subsets = [cols[0:i] for i in range(1, len(cols) + 1)]

        models = []
        for x in subsets:
            models.extend(self._train_and_get_models(model_class, x, y, training_frame, **kwargs))

        if protected_columns is None or len(protected_columns) == 0:
            return make_leaderboard(models, leaderboard_frame=test_frame)
        return disparate_analysis(models, test_frame, protected_columns, reference, favorable_class)
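    # The `metric` argument of train_subset_models() also accepts a callable that reduces the selected
    # admissible-score columns to a single ranking column. A hedged sketch (assumes `ig`, `train`, `test`, `y`,
    # `protected_columns`, `reference` and `favorable_class` are defined as in the :examples: block above; the
    # column name "relevance_index" and the 0.7/0.3 weighting are assumptions for illustration only -- check
    # ig.get_admissible_score_frame().columns for the actual column names):
    #
    #     >>> def weighted_sum(score_frame, fs_metrics):
    #     ...     # combine the selected columns into a single sort column
    #     ...     return 0.7 * score_frame[:, fs_metrics[0]] + 0.3 * score_frame[:, fs_metrics[1]]
    #     >>> ig.train_subset_models(H2OGradientBoostingEstimator, y, train, test,
    #     ...                        protected_columns, reference, favorable_class,
    #     ...                        feature_selection_metrics=["safety_index", "relevance_index"],
    #     ...                        metric=weighted_sum)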