custom_distribution_func

  • Available in: GBM

  • Hyperparameter: no

Description

Use this option to specify a custom distribution function, which lets you customize how the loss function is calculated. As shown in the example below, it is used together with ``distribution="custom"``.

Notes:

Example

import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
h2o.init()
h2o.cluster().show_status()

# Load the airlines dataset. It is used to classify whether a flight
# will be delayed ("YES") or not ("NO").
# The original data can be found at http://www.transtats.bts.gov/
airlines_url = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip"
airlines = h2o.import_file(airlines_url)

# Convert these columns to factors, one after another
for factor_col in ["Year", "Month", "DayOfWeek", "Cancelled", "FlightNum"]:
    airlines[factor_col] = airlines[factor_col].asfactor()

# Choose the predictor columns and the response column
predictors = [
    "Origin", "Dest", "Year", "UniqueCarrier",
    "DayOfWeek", "Month", "Distance", "FlightNum",
]
response = "IsDepDelayed"

# Split the frame into training and validation sets
train, valid = airlines.split_frame(ratios=[0.8], seed=1234)

# Set up a GBM with the built-in bernoulli distribution, then train it
airlines_gbm = H2OGradientBoostingEstimator(
    ntrees=3,
    max_depth=5,
    distribution="bernoulli",
    seed=1234,
)
airlines_gbm.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=valid,
)

# Print the AUC computed on the validation data
validation_auc = airlines_gbm.auc(valid=True)
print(validation_auc)

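# Now use a custom distribution instead.
# Create a custom Bernoulli distribution; it is uploaded below as custom_bernoulli.py.
# Note that the class imports the Java class java.lang.Math, so its exp() method
# is meant to be evaluated by H2O rather than in the local Python session.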

class MyBernoulli():
    """Hand-written distribution with a logit link.

    The class defines the methods link, init, gradient and gamma, plus a
    small exp helper used by gradient. The helper imports java.lang.Math,
    so exp (and therefore gradient) cannot be called from a plain
    CPython interpreter.
    """

    def exp(self, x):
        """Return e**x, capped from above at 1e19."""
        import java.lang.Math as Math
        upper_bound = 1e19
        return Math.min(upper_bound, Math.exp(x))

    def link(self):
        """Return the name of the link function, "logit"."""
        return "logit"

    def init(self, w, o, y):
        """Return the two-element list [w * (y - o), w].

        NOTE(review): w, o and y are presumably the observation weight,
        offset and response - confirm against the H2O documentation.
        """
        weighted_diff = w * (y - o)
        return [weighted_diff, w]

    def gradient(self, y, f):
        """Return y minus the logistic transform of f."""
        predicted = 1 / (1 + self.exp(-f))
        return y - predicted

    def gamma(self, w, y, z, f):
        """Return [w * z, w * d * (1 - d)] where d = y - z.

        The argument f is accepted but not used.
        """
        diff = y - z
        return [w * z, w * diff * (1 - diff)]


# Upload the MyBernoulli class to H2O as a custom distribution
custom_dist_func = h2o.upload_custom_distribution(
    MyBernoulli,
    func_name="custom_bernoulli",
    func_file="custom_bernoulli.py",
)

# Train a GBM that uses the uploaded distribution
airlines_gbm_custom = H2OGradientBoostingEstimator(
    ntrees=3,
    max_depth=5,
    distribution="custom",
    custom_distribution_func=custom_dist_func,
    seed=1234,
)
airlines_gbm_custom.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=valid,
)

# Print the AUC on the validation data - the result should be the same
# as for the model trained with the built-in bernoulli distribution
custom_validation_auc = airlines_gbm_custom.auc(valid=True)
print(custom_validation_auc)

# To customize a distribution for a specific type of problem, we recommend inheriting from one of the predefined classes:
# - CustomDistributionGaussian - for regression problems
# - CustomDistributionBernoulli - for 2-class classification problems
# - CustomDistributionMultinomial - for n-class classification problems

# For example, if you want to apply an asymmetric loss function in a classification problem, you can implement a class
# that inherits from CustomDistributionBernoulli and overrides its gradient method.

from h2o.utils.distributions import CustomDistributionBernoulli

class MyBernoulliAsymmetric(CustomDistributionBernoulli):
    """Bernoulli distribution whose gradient is scaled asymmetrically.

    Only gradient is overridden; everything else comes from
    CustomDistributionBernoulli.
    """

    def gradient(self, y, f):
        """Return the error scaled by 0.5 when negative, by 2 otherwise.

        The error is y minus the logistic transform of f.
        """
        residual = y - (1 / (1 + self.exp(-f)))
        if residual < 0:
            return 0.5 * residual
        return 2 * residual


# Upload the asymmetric distribution class to H2O
custom_dist_func = h2o.upload_custom_distribution(
    MyBernoulliAsymmetric,
    func_name="custom_bernoulli_asym",
    func_file="custom_bernoulli_asym.py",
)

# Train a GBM that uses the asymmetric custom distribution
airlines_gbm_custom_asym = H2OGradientBoostingEstimator(
    ntrees=3,
    max_depth=5,
    distribution="custom",
    custom_distribution_func=custom_dist_func,
    seed=1234,
)
airlines_gbm_custom_asym.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=valid,
)

# Print the AUC on the validation data
asym_validation_auc = airlines_gbm_custom_asym.auc(valid=True)
print(asym_validation_auc)