Source code for h2o.estimators.kmeans

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# This file is auto-generated by h2o-3/h2o-bindings/bin/
# Copyright 2016;  Apache License Version 2.0 (see LICENSE for details)

from h2o.estimators.estimator_base import H2OEstimator
from h2o.exceptions import H2OValueError
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type, Enum, numeric

class H2OKMeansEstimator(H2OEstimator):
    """
    K-means

    Performs k-means clustering on an H2O dataset.

    Every constructor argument is exposed as a property of the same name; the
    property setters validate the value and store it in the ``_parms`` dict.
    """

    algo = "kmeans"
    supervised_learning = False

    def __init__(self,
                 model_id=None,  # type: Optional[Union[None, str, H2OEstimator]]
                 training_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
                 validation_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
                 nfolds=0,  # type: int
                 keep_cross_validation_models=True,  # type: bool
                 keep_cross_validation_predictions=False,  # type: bool
                 keep_cross_validation_fold_assignment=False,  # type: bool
                 fold_assignment="auto",  # type: Literal["auto", "random", "modulo", "stratified"]
                 fold_column=None,  # type: Optional[str]
                 ignored_columns=None,  # type: Optional[List[str]]
                 ignore_const_cols=True,  # type: bool
                 score_each_iteration=False,  # type: bool
                 k=1,  # type: int
                 estimate_k=False,  # type: bool
                 user_points=None,  # type: Optional[Union[None, str, H2OFrame]]
                 max_iterations=10,  # type: int
                 standardize=True,  # type: bool
                 seed=-1,  # type: int
                 init="furthest",  # type: Literal["random", "plus_plus", "furthest", "user"]
                 max_runtime_secs=0.0,  # type: float
                 categorical_encoding="auto",  # type: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"]
                 export_checkpoints_dir=None,  # type: Optional[str]
                 cluster_size_constraints=None,  # type: Optional[List[int]]
                 ):
        """
        :param model_id: Destination id for this model; auto-generated if not specified.
               Defaults to ``None``.
        :type model_id: Union[None, str, H2OEstimator], optional
        :param training_frame: Id of the training data frame.
               Defaults to ``None``.
        :type training_frame: Union[None, str, H2OFrame], optional
        :param validation_frame: Id of the validation data frame.
               Defaults to ``None``.
        :type validation_frame: Union[None, str, H2OFrame], optional
        :param nfolds: Number of folds for K-fold cross-validation (0 to disable or >= 2).
               Defaults to ``0``.
        :type nfolds: int
        :param keep_cross_validation_models: Whether to keep the cross-validation models.
               Defaults to ``True``.
        :type keep_cross_validation_models: bool
        :param keep_cross_validation_predictions: Whether to keep the predictions of the cross-validation models.
               Defaults to ``False``.
        :type keep_cross_validation_predictions: bool
        :param keep_cross_validation_fold_assignment: Whether to keep the cross-validation fold assignment.
               Defaults to ``False``.
        :type keep_cross_validation_fold_assignment: bool
        :param fold_assignment: Cross-validation fold assignment scheme, if fold_column is not specified. The
               'Stratified' option will stratify the folds based on the response variable, for classification
               problems.
               Defaults to ``"auto"``.
        :type fold_assignment: Literal["auto", "random", "modulo", "stratified"]
        :param fold_column: Column with cross-validation fold index assignment per observation.
               Defaults to ``None``.
        :type fold_column: str, optional
        :param ignored_columns: Names of columns to ignore for training.
               Defaults to ``None``.
        :type ignored_columns: List[str], optional
        :param ignore_const_cols: Ignore constant columns.
               Defaults to ``True``.
        :type ignore_const_cols: bool
        :param score_each_iteration: Whether to score during each iteration of model training.
               Defaults to ``False``.
        :type score_each_iteration: bool
        :param k: The max. number of clusters. If estimate_k is disabled, the model will find k centroids, otherwise
               it will find up to k centroids.
               Defaults to ``1``.
        :type k: int
        :param estimate_k: Whether to estimate the number of clusters (<=k) iteratively and deterministically.
               Defaults to ``False``.
        :type estimate_k: bool
        :param user_points: This option allows you to specify a dataframe, where each row represents an initial
               cluster center. The user-specified points must have the same number of columns as the training
               observations. The number of rows must equal the number of clusters.
               Defaults to ``None``.
        :type user_points: Union[None, str, H2OFrame], optional
        :param max_iterations: Maximum training iterations (if estimate_k is enabled, then this is for each inner
               Lloyds iteration).
               Defaults to ``10``.
        :type max_iterations: int
        :param standardize: Standardize columns before computing distances.
               Defaults to ``True``.
        :type standardize: bool
        :param seed: RNG Seed.
               Defaults to ``-1``.
        :type seed: int
        :param init: Initialization mode.
               Defaults to ``"furthest"``.
        :type init: Literal["random", "plus_plus", "furthest", "user"]
        :param max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
               Defaults to ``0.0``.
        :type max_runtime_secs: float
        :param categorical_encoding: Encoding scheme for categorical features.
               Defaults to ``"auto"``.
        :type categorical_encoding: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder",
               "sort_by_response", "enum_limited"]
        :param export_checkpoints_dir: Automatically export generated models to this directory.
               Defaults to ``None``.
        :type export_checkpoints_dir: str, optional
        :param cluster_size_constraints: An array specifying the minimum number of points that should be in each
               cluster. The length of the constraints array has to be the same as the number of clusters.
               Defaults to ``None``.
        :type cluster_size_constraints: List[int], optional
        """
        super(H2OKMeansEstimator, self).__init__()
        self._parms = {}
        # model_id has no property in this class; it is stored directly, both
        # as ``_id`` and under the 'model_id' key of ``_parms``.
        self._id = self._parms['model_id'] = model_id
        # Every remaining argument goes through the property setter of the same
        # name below, which validates it before writing it into ``_parms``.
        self.training_frame = training_frame
        self.validation_frame = validation_frame
        self.nfolds = nfolds
        self.keep_cross_validation_models = keep_cross_validation_models
        self.keep_cross_validation_predictions = keep_cross_validation_predictions
        self.keep_cross_validation_fold_assignment = keep_cross_validation_fold_assignment
        self.fold_assignment = fold_assignment
        self.fold_column = fold_column
        self.ignored_columns = ignored_columns
        self.ignore_const_cols = ignore_const_cols
        self.score_each_iteration = score_each_iteration
        self.k = k
        self.estimate_k = estimate_k
        self.user_points = user_points
        self.max_iterations = max_iterations
        self.standardize = standardize
        self.seed = seed
        self.init = init
        self.max_runtime_secs = max_runtime_secs
        self.categorical_encoding = categorical_encoding
        self.export_checkpoints_dir = export_checkpoints_dir
        self.cluster_size_constraints = cluster_size_constraints

    @property
    def training_frame(self):
        """
        Id of the training data frame.

        Type: ``Union[None, str, H2OFrame]``.

        :examples:

        >>> prostate = h2o.import_file("")
        >>> predictors = ["AGE", "RACE", "DPROS", "DCAPS",
        ...               "PSA", "VOL", "GLEASON"]
        >>> train, valid = prostate.split_frame(ratios=[.8], seed=1234)
        >>> pros_km = H2OKMeansEstimator(seed=1234)
        >>> pros_km.train(x=predictors,
        ...               training_frame=train,
        ...               validation_frame=valid)
        >>> pros_km.scoring_history()
        """
        return self._parms.get("training_frame")

    @training_frame.setter
    def training_frame(self, training_frame):
        # Frame-valued parameters are checked by H2OFrame._validate rather than
        # assert_is_type; whatever it returns is what gets stored.
        self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame')

    @property
    def validation_frame(self):
        """
        Id of the validation data frame.

        Type: ``Union[None, str, H2OFrame]``.

        :examples:

        >>> prostate = h2o.import_file("")
        >>> predictors = ["AGE", "RACE", "DPROS", "DCAPS",
        ...               "PSA", "VOL", "GLEASON"]
        >>> train, valid = prostate.split_frame(ratios=[.8], seed=1234)
        >>> pros_km = H2OKMeansEstimator(seed=1234)
        >>> pros_km.train(x=predictors,
        ...               training_frame=train,
        ...               validation_frame=valid)
        >>> pros_km.scoring_history()
        """
        return self._parms.get("validation_frame")

    @validation_frame.setter
    def validation_frame(self, validation_frame):
        self._parms["validation_frame"] = H2OFrame._validate(validation_frame, 'validation_frame')

    @property
    def nfolds(self):
        """
        Number of folds for K-fold cross-validation (0 to disable or >= 2).

        Type: ``int``, defaults to ``0``.

        :examples:

        >>> benign = h2o.import_file("")
        >>> predictors = ["AGMT","FNDX","HIGD","DEG","CHK",
        ...               "AGP1","AGMN","LIV","AGLP"]
        >>> train, valid = benign.split_frame(ratios=[.8], seed=1234)
        >>> benign_km = H2OKMeansEstimator(nfolds=5, seed=1234)
        >>> benign_km.train(x=predictors,
        ...                 training_frame=train,
        ...                 validation_frame=valid)
        >>> benign_km.scoring_history()
        """
        return self._parms.get("nfolds")

    @nfolds.setter
    def nfolds(self, nfolds):
        assert_is_type(nfolds, None, int)
        self._parms["nfolds"] = nfolds

    @property
    def keep_cross_validation_models(self):
        """
        Whether to keep the cross-validation models.

        Type: ``bool``, defaults to ``True``.

        :examples:

        >>> ozone = h2o.import_file("")
        >>> predictors = ["radiation","temperature","wind"]
        >>> train, valid = ozone.split_frame(ratios=[.8], seed=1234)
        >>> ozone_km = H2OKMeansEstimator(keep_cross_validation_models=True,
        ...                               nfolds=5,
        ...                               seed=1234)
        >>> ozone_km.train(x=predictors,
        ...                training_frame=train,
        ...                validation_frame=valid)
        >>> ozone_km.scoring_history()
        """
        return self._parms.get("keep_cross_validation_models")

    @keep_cross_validation_models.setter
    def keep_cross_validation_models(self, keep_cross_validation_models):
        assert_is_type(keep_cross_validation_models, None, bool)
        self._parms["keep_cross_validation_models"] = keep_cross_validation_models

    @property
    def keep_cross_validation_predictions(self):
        """
        Whether to keep the predictions of the cross-validation models.

        Type: ``bool``, defaults to ``False``.

        :examples:

        >>> prostate = h2o.import_file("")
        >>> predictors = ["AGE", "RACE", "DPROS", "DCAPS",
        ...               "PSA", "VOL", "GLEASON"]
        >>> train, valid = prostate.split_frame(ratios=[.8], seed=1234)
        >>> pros_km = H2OKMeansEstimator(keep_cross_validation_predictions=True,
        ...                              nfolds=5,
        ...                              seed=1234)
        >>> pros_km.train(x=predictors,
        ...               training_frame=train,
        ...               validation_frame=valid)
        >>> pros_km.scoring_history()
        """
        return self._parms.get("keep_cross_validation_predictions")

    @keep_cross_validation_predictions.setter
    def keep_cross_validation_predictions(self, keep_cross_validation_predictions):
        assert_is_type(keep_cross_validation_predictions, None, bool)
        self._parms["keep_cross_validation_predictions"] = keep_cross_validation_predictions

    @property
    def keep_cross_validation_fold_assignment(self):
        """
        Whether to keep the cross-validation fold assignment.

        Type: ``bool``, defaults to ``False``.

        :examples:

        >>> ozone = h2o.import_file("")
        >>> predictors = ["radiation","temperature","wind"]
        >>> train, valid = ozone.split_frame(ratios=[.8], seed=1234)
        >>> ozone_km = H2OKMeansEstimator(keep_cross_validation_fold_assignment=True,
        ...                               nfolds=5,
        ...                               seed=1234)
        >>> ozone_km.train(x=predictors,
        ...                training_frame=train)
        >>> ozone_km.scoring_history()
        """
        return self._parms.get("keep_cross_validation_fold_assignment")

    @keep_cross_validation_fold_assignment.setter
    def keep_cross_validation_fold_assignment(self, keep_cross_validation_fold_assignment):
        assert_is_type(keep_cross_validation_fold_assignment, None, bool)
        self._parms["keep_cross_validation_fold_assignment"] = keep_cross_validation_fold_assignment

    @property
    def fold_assignment(self):
        """
        Cross-validation fold assignment scheme, if fold_column is not specified. The 'Stratified' option will stratify
        the folds based on the response variable, for classification problems.

        Type: ``Literal["auto", "random", "modulo", "stratified"]``, defaults to ``"auto"``.

        :examples:

        >>> ozone = h2o.import_file("")
        >>> predictors = ["radiation","temperature","wind"]
        >>> train, valid = ozone.split_frame(ratios=[.8], seed=1234)
        >>> ozone_km = H2OKMeansEstimator(fold_assignment="Random",
        ...                               nfolds=5,
        ...                               seed=1234)
        >>> ozone_km.train(x=predictors,
        ...                training_frame=train,
        ...                validation_frame=valid)
        >>> ozone_km.scoring_history()
        """
        return self._parms.get("fold_assignment")

    @fold_assignment.setter
    def fold_assignment(self, fold_assignment):
        assert_is_type(fold_assignment, None, Enum("auto", "random", "modulo", "stratified"))
        self._parms["fold_assignment"] = fold_assignment

    @property
    def fold_column(self):
        """
        Column with cross-validation fold index assignment per observation.

        Type: ``str``.

        :examples:

        >>> cars = h2o.import_file("")
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> fold_numbers = cars.kfold_column(n_folds=5, seed=1234)
        >>> fold_numbers.set_names(["fold_numbers"])
        >>> cars = cars.cbind(fold_numbers)
        >>> print(cars['fold_numbers'])
        >>> cars_km = H2OKMeansEstimator(seed=1234)
        >>> cars_km.train(x=predictors,
        ...               training_frame=cars,
        ...               fold_column="fold_numbers")
        >>> cars_km.scoring_history()
        """
        return self._parms.get("fold_column")

    @fold_column.setter
    def fold_column(self, fold_column):
        assert_is_type(fold_column, None, str)
        self._parms["fold_column"] = fold_column

    @property
    def ignored_columns(self):
        """
        Names of columns to ignore for training.

        Type: ``List[str]``.
        """
        return self._parms.get("ignored_columns")

    @ignored_columns.setter
    def ignored_columns(self, ignored_columns):
        assert_is_type(ignored_columns, None, [str])
        self._parms["ignored_columns"] = ignored_columns

    @property
    def ignore_const_cols(self):
        """
        Ignore constant columns.

        Type: ``bool``, defaults to ``True``.

        :examples:

        >>> cars = h2o.import_file("")
        >>> predictors = ["displacement","power","weight","acceleration","year"]
        >>> cars["const_1"] = 6
        >>> cars["const_2"] = 7
        >>> train, valid = cars.split_frame(ratios=[.8], seed=1234)
        >>> cars_km = H2OKMeansEstimator(ignore_const_cols=True,
        ...                              seed=1234)
        >>> cars_km.train(x=predictors,
        ...               training_frame=train,
        ...               validation_frame=valid)
        >>> cars_km.scoring_history()
        """
        return self._parms.get("ignore_const_cols")

    @ignore_const_cols.setter
    def ignore_const_cols(self, ignore_const_cols):
        assert_is_type(ignore_const_cols, None, bool)
        self._parms["ignore_const_cols"] = ignore_const_cols

    @property
    def score_each_iteration(self):
        """
        Whether to score during each iteration of model training.

        Type: ``bool``, defaults to ``False``.

        :examples:

        >>> benign = h2o.import_file("")
        >>> predictors = ["AGMT","FNDX","HIGD","DEG","CHK",
        ...               "AGP1","AGMN","LIV","AGLP"]
        >>> train, valid = benign.split_frame(ratios=[.8], seed=1234)
        >>> benign_km = H2OKMeansEstimator(score_each_iteration=True,
        ...                                seed=1234)
        >>> benign_km.train(x=predictors,
        ...                 training_frame=train,
        ...                 validation_frame=valid)
        >>> benign_km.scoring_history()
        """
        return self._parms.get("score_each_iteration")

    @score_each_iteration.setter
    def score_each_iteration(self, score_each_iteration):
        assert_is_type(score_each_iteration, None, bool)
        self._parms["score_each_iteration"] = score_each_iteration

    @property
    def k(self):
        """
        The max. number of clusters. If estimate_k is disabled, the model will find k centroids, otherwise it will find
        up to k centroids.

        Type: ``int``, defaults to ``1``.

        :examples:

        >>> seeds = h2o.import_file("")
        >>> predictors = seeds.columns[0:7]
        >>> train, valid = seeds.split_frame(ratios=[.8], seed=1234)
        >>> seeds_km = H2OKMeansEstimator(k=3, seed=1234)
        >>> seeds_km.train(x=predictors,
        ...                training_frame=train,
        ...                validation_frame=valid)
        >>> seeds_km.scoring_history()
        """
        return self._parms.get("k")

    @k.setter
    def k(self, k):
        assert_is_type(k, None, int)
        self._parms["k"] = k

    @property
    def estimate_k(self):
        """
        Whether to estimate the number of clusters (<=k) iteratively and deterministically.

        Type: ``bool``, defaults to ``False``.

        :examples:

        >>> iris = h2o.import_file("")
        >>> iris['class'] = iris['class'].asfactor()
        >>> predictors = iris.columns[:-1]
        >>> train, valid = iris.split_frame(ratios=[.8], seed=1234)
        >>> iris_kmeans = H2OKMeansEstimator(k=10,
        ...                                  estimate_k=True,
        ...                                  standardize=False,
        ...                                  seed=1234)
        >>> iris_kmeans.train(x=predictors,
        ...                   training_frame=train,
        ...                   validation_frame=valid)
        >>> iris_kmeans.scoring_history()
        """
        return self._parms.get("estimate_k")

    @estimate_k.setter
    def estimate_k(self, estimate_k):
        assert_is_type(estimate_k, None, bool)
        self._parms["estimate_k"] = estimate_k

    @property
    def user_points(self):
        """
        This option allows you to specify a dataframe, where each row represents an initial cluster center. The
        user-specified points must have the same number of columns as the training observations. The number of rows
        must equal the number of clusters.

        Type: ``Union[None, str, H2OFrame]``.

        :examples:

        >>> iris = h2o.import_file("")
        >>> iris['class'] = iris['class'].asfactor()
        >>> predictors = iris.columns[:-1]
        >>> train, valid = iris.split_frame(ratios=[.8], seed=1234)
        >>> point1 = [4.9,3.0,1.4,0.2]
        >>> point2 = [5.6,2.5,3.9,1.1]
        >>> point3 = [6.5,3.0,5.2,2.0]
        >>> points = h2o.H2OFrame([point1, point2, point3])
        >>> iris_km = H2OKMeansEstimator(k=3,
        ...                              user_points=points,
        ...                              seed=1234)
        >>> iris_km.train(x=predictors,
        ...               training_frame=iris,
        ...               validation_frame=valid)
        >>> iris_km.tot_withinss(valid=True)
        """
        return self._parms.get("user_points")

    @user_points.setter
    def user_points(self, user_points):
        self._parms["user_points"] = H2OFrame._validate(user_points, 'user_points')

    @property
    def max_iterations(self):
        """
        Maximum training iterations (if estimate_k is enabled, then this is for each inner Lloyds iteration).

        Type: ``int``, defaults to ``10``.

        :examples:

        >>> benign = h2o.import_file("")
        >>> predictors = ["AGMT","FNDX","HIGD","DEG","CHK",
        ...               "AGP1","AGMN","LIV","AGLP"]
        >>> train, valid = benign.split_frame(ratios=[.8], seed=1234)
        >>> benign_km = H2OKMeansEstimator(max_iterations=50)
        >>> benign_km.train(x=predictors,
        ...                 training_frame=train,
        ...                 validation_frame=valid)
        >>> benign_km.scoring_history()
        """
        return self._parms.get("max_iterations")

    @max_iterations.setter
    def max_iterations(self, max_iterations):
        assert_is_type(max_iterations, None, int)
        self._parms["max_iterations"] = max_iterations

    @property
    def standardize(self):
        """
        Standardize columns before computing distances.

        Type: ``bool``, defaults to ``True``.

        :examples:

        >>> boston = h2o.import_file("")
        >>> predictors = boston.columns[:-1]
        >>> boston['chas'] = boston['chas'].asfactor()
        >>> train, valid = boston.split_frame(ratios=[.8])
        >>> boston_km = H2OKMeansEstimator(standardize=True)
        >>> boston_km.train(x=predictors,
        ...                 training_frame=train,
        ...                 validation_frame=valid)
        >>> boston_km.scoring_history()
        """
        return self._parms.get("standardize")

    @standardize.setter
    def standardize(self, standardize):
        assert_is_type(standardize, None, bool)
        self._parms["standardize"] = standardize

    @property
    def seed(self):
        """
        RNG Seed.

        Type: ``int``, defaults to ``-1``.

        :examples:

        >>> prostate = h2o.import_file("")
        >>> predictors = ["AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
        >>> train, valid = prostate.split_frame(ratios=[.8], seed=1234)
        >>> pros_w_seed = H2OKMeansEstimator(seed=1234)
        >>> pros_w_seed.train(x=predictors,
        ...                   training_frame=train,
        ...                   validation_frame=valid)
        >>> pros_wo_seed = H2OKMeansEstimator()
        >>> pros_wo_seed.train(x=predictors,
        ...                    training_frame=train,
        ...                    validation_frame=valid)
        >>> pros_w_seed.scoring_history()
        >>> pros_wo_seed.scoring_history()
        """
        return self._parms.get("seed")

    @seed.setter
    def seed(self, seed):
        assert_is_type(seed, None, int)
        self._parms["seed"] = seed

    @property
    def init(self):
        """
        Initialization mode.

        Type: ``Literal["random", "plus_plus", "furthest", "user"]``, defaults to ``"furthest"``.

        :examples:

        >>> seeds = h2o.import_file("")
        >>> predictors = seeds.columns[0:7]
        >>> train, valid = seeds.split_frame(ratios=[.8], seed=1234)
        >>> seeds_km = H2OKMeansEstimator(k=3,
        ...                               init='Furthest',
        ...                               seed=1234)
        >>> seeds_km.train(x=predictors,
        ...                training_frame=train,
        ...                validation_frame=valid)
        >>> seeds_km.scoring_history()
        """
        return self._parms.get("init")

    @init.setter
    def init(self, init):
        assert_is_type(init, None, Enum("random", "plus_plus", "furthest", "user"))
        self._parms["init"] = init

    @property
    def max_runtime_secs(self):
        """
        Maximum allowed runtime in seconds for model training. Use 0 to disable.

        Type: ``float``, defaults to ``0.0``.

        :examples:

        >>> benign = h2o.import_file("")
        >>> predictors = ["AGMT","FNDX","HIGD","DEG","CHK",
        ...               "AGP1","AGMN","LIV","AGLP"]
        >>> train, valid = benign.split_frame(ratios=[.8], seed=1234)
        >>> benign_km = H2OKMeansEstimator(max_runtime_secs=10,
        ...                                seed=1234)
        >>> benign_km.train(x=predictors,
        ...                 training_frame=train,
        ...                 validation_frame=valid)
        >>> benign_km.scoring_history()
        """
        return self._parms.get("max_runtime_secs")

    @max_runtime_secs.setter
    def max_runtime_secs(self, max_runtime_secs):
        assert_is_type(max_runtime_secs, None, numeric)
        self._parms["max_runtime_secs"] = max_runtime_secs

    @property
    def categorical_encoding(self):
        """
        Encoding scheme for categorical features.

        Type: ``Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder",
        "sort_by_response", "enum_limited"]``, defaults to ``"auto"``.

        :examples:

        >>> prostate = h2o.import_file("")
        >>> predictors = ["AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
        >>> train, valid = prostate.split_frame(ratios=[.8], seed=1234)
        >>> encoding = "one_hot_explicit"
        >>> pros_km = H2OKMeansEstimator(categorical_encoding=encoding,
        ...                              seed=1234)
        >>> pros_km.train(x=predictors,
        ...               training_frame=train,
        ...               validation_frame=valid)
        >>> pros_km.scoring_history()
        """
        return self._parms.get("categorical_encoding")

    @categorical_encoding.setter
    def categorical_encoding(self, categorical_encoding):
        assert_is_type(categorical_encoding, None, Enum("auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"))
        self._parms["categorical_encoding"] = categorical_encoding

    @property
    def export_checkpoints_dir(self):
        """
        Automatically export generated models to this directory.

        Type: ``str``.

        :examples:

        >>> import tempfile
        >>> from os import listdir
        >>> airlines = h2o.import_file("", destination_frame="air.hex")
        >>> predictors = ["DayofMonth", "DayOfWeek"]
        >>> checkpoints_dir = tempfile.mkdtemp()
        >>> air_km = H2OKMeansEstimator(export_checkpoints_dir=checkpoints_dir,
        ...                             seed=1234)
        >>> air_km.train(x=predictors, training_frame=airlines)
        >>> len(listdir(checkpoints_dir))
        """
        return self._parms.get("export_checkpoints_dir")

    @export_checkpoints_dir.setter
    def export_checkpoints_dir(self, export_checkpoints_dir):
        assert_is_type(export_checkpoints_dir, None, str)
        self._parms["export_checkpoints_dir"] = export_checkpoints_dir

    @property
    def cluster_size_constraints(self):
        """
        An array specifying the minimum number of points that should be in each cluster. The length of the constraints
        array has to be the same as the number of clusters.

        Type: ``List[int]``.

        :examples:

        >>> iris_h2o = h2o.import_file("")
        >>> k=3
        >>> start_points = h2o.H2OFrame(
        ...         [[4.9, 3.0, 1.4, 0.2],
        ...          [5.6, 2.5, 3.9, 1.1],
        ...          [6.5, 3.0, 5.2, 2.0]])
        >>> kmm = H2OKMeansEstimator(k=k,
        ...                          user_points=start_points,
        ...                          standardize=True,
        ...                          cluster_size_constraints=[2, 5, 8],
        ...                          score_each_iteration=True)
        >>> kmm.train(x=list(range(7)), training_frame=iris_h2o)
        >>> kmm.scoring_history()
        """
        return self._parms.get("cluster_size_constraints")

    @cluster_size_constraints.setter
    def cluster_size_constraints(self, cluster_size_constraints):
        assert_is_type(cluster_size_constraints, None, [int])
        self._parms["cluster_size_constraints"] = cluster_size_constraints