Source code for h2o4gpu.util.metrics

# -*- encoding: utf-8 -*-
"""
:copyright: 2017-2018 H2O.ai, Inc.
:license:   Apache License Version 2.0 (see LICENSE for details)
"""
import numpy as np
import pandas as pd


def ll(actual, predicted):
    """
    Computes the log likelihood.

    This function computes the log likelihood between two numbers,
    or element-wise between a pair of lists or numpy arrays.

    :param actual: int, float, list of numbers, numpy array
                   The ground truth value
    :param predicted: same type as actual
                      The predicted value

    :returns: double or list of doubles
              The log likelihood error between actual and predicted
    """
    actual = np.array(actual)
    predicted = np.array(predicted)
    # Clip predictions away from exactly 0 and 1 so the logs stay finite;
    # np.clip also handles the scalar inputs the docstring allows.
    predicted = np.clip(predicted.astype(float), 1e-15, 1 - 1e-15)
    # Silence floating-point warnings, then restore the previous settings.
    err = np.seterr(all='ignore')
    score = -(actual * np.log(predicted) +
              (1 - actual) * np.log(1 - predicted))
    np.seterr(**err)
    if isinstance(score, np.ndarray):
        score[np.isnan(score)] = 0
    elif np.isnan(score):
        score = 0
    return score


def log_loss(actual, predicted):
    """
    Computes the log loss.

    This function computes the log loss between two lists of numbers.

    :param actual: int, float, list of numbers, numpy array
                   The ground truth value
    :param predicted: same type as actual
                      The predicted value

    :returns: double
              The log loss between actual and predicted
    """
    return np.mean(ll(actual, predicted))
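
# Usage sketch (illustrative example, not part of the original module):
# ll() returns the per-element negative log likelihood and log_loss()
# averages it, so on a toy binary problem:
#
# >>> ll([1, 0], [0.9, 0.2])                          # ~[0.1054, 0.2231]
# >>> log_loss([1, 1, 0, 0], [0.9, 0.8, 0.2, 0.1])    # ~0.1643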


def se(actual, predicted):
    """
    Computes the squared error.

    This function computes the squared error between two numbers,
    or element-wise between a pair of lists or numpy arrays.

    :param actual: int, float, list of numbers, numpy array
                   The ground truth value
    :param predicted: same type as actual
                      The predicted value

    :returns: double or list of doubles
              The squared error between actual and predicted
    """
    return np.power(np.array(actual) - np.array(predicted), 2)


def mse(actual, predicted):
    """
    Computes the mean squared error.

    This function computes the mean squared error between two lists
    of numbers.

    :param actual: int, float, list of numbers, numpy array
                   The ground truth value
    :param predicted: same type as actual
                      The predicted value

    :returns: double
              The mean squared error between actual and predicted
    """
    return np.mean(se(actual, predicted))


def rmse(actual, predicted):
    """
    Computes the root mean squared error.

    This function computes the root mean squared error between two lists
    of numbers.

    :param actual: int, float, list of numbers, numpy array
                   The ground truth value
    :param predicted: same type as actual
                      The predicted value

    :returns: double
              The root mean squared error between actual and predicted
    """
    return np.sqrt(mse(actual, predicted))
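
# Usage sketch (illustrative example, not part of the original module):
# se(), mse() and rmse() chain together, so for the same inputs:
#
# >>> se([3, 4, 5], [2, 4, 7])     # array([1, 0, 4])
# >>> mse([3, 4, 5], [2, 4, 7])    # 5/3 ~ 1.6667
# >>> rmse([3, 4, 5], [2, 4, 7])   # sqrt(5/3) ~ 1.2910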


def ce(actual, predicted):
    """
    Computes the classification error.

    This function computes the classification error between two lists
    of numbers.

    :param actual: int, float, list of numbers, numpy array
                   The ground truth value
    :param predicted: same type as actual
                      The predicted value

    :returns: double
              The classification error between actual and predicted
    """
    return (sum([1.0 for x, y in zip(actual, predicted) if x != y]) /
            len(actual))
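
# Usage sketch (illustrative example, not part of the original module):
# the classification error is simply the fraction of mismatched labels.
#
# >>> ce([1, 0, 1, 1], [1, 1, 0, 1])   # 2 of 4 differ -> 0.5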


def ae(actual, predicted):
    """
    Computes the absolute error.

    This function computes the absolute error between two numbers,
    or element-wise between a pair of lists or numpy arrays.

    :param actual: int, float, list of numbers, numpy array
                   The ground truth value
    :param predicted: same type as actual
                      The predicted value

    :returns: double or list of doubles
              The absolute error between actual and predicted
    """
    return np.abs(np.array(actual) - np.array(predicted))


def mae(actual, predicted):
    """
    Computes the mean absolute error.

    This function computes the mean absolute error between two lists
    of numbers.

    :param actual: int, float, list of numbers, numpy array
                   The ground truth value
    :param predicted: same type as actual
                      The predicted value

    :returns: double
              The mean absolute error between actual and predicted
    """
    return np.mean(ae(actual, predicted))
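
# Usage sketch (illustrative example, not part of the original module):
#
# >>> ae([3, 4, 5], [2, 4, 7])    # array([1, 0, 2])
# >>> mae([3, 4, 5], [2, 4, 7])   # (1 + 0 + 2) / 3 = 1.0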


def sle(actual, predicted):
    """
    Computes the squared log error.

    This function computes the squared log error between two numbers,
    or element-wise between a pair of lists or numpy arrays.

    :param actual: int, float, list of numbers, numpy array
                   The ground truth value
    :param predicted: same type as actual
                      The predicted value

    :returns: double or list of doubles
              The squared log error between actual and predicted
    """
    return np.power(
        np.log(np.array(actual) + 1) - np.log(np.array(predicted) + 1), 2)


def msle(actual, predicted):
    """
    Computes the mean squared log error.

    This function computes the mean squared log error between two lists
    of numbers.

    :param actual: int, float, list of numbers, numpy array
                   The ground truth value
    :param predicted: same type as actual
                      The predicted value

    :returns: double
              The mean squared log error between actual and predicted
    """
    return np.mean(sle(actual, predicted))


def rmsle(actual, predicted):
    """
    Computes the root mean squared log error.

    This function computes the root mean squared log error between two
    lists of numbers.

    :param actual: int, float, list of numbers, numpy array
                   The ground truth value
    :param predicted: same type as actual
                      The predicted value

    :returns: double
              The root mean squared log error between actual and predicted
    """
    return np.sqrt(msle(actual, predicted))
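
# Usage sketch (illustrative example, not part of the original module):
# the log-error family measures relative rather than absolute
# differences, since log(a + 1) - log(p + 1) = log((a + 1) / (p + 1)).
#
# >>> sle([1, 2, 3], [2, 1, 4])     # ~array([0.1644, 0.1644, 0.0498])
# >>> msle([1, 2, 3], [2, 1, 4])    # ~0.1262
# >>> rmsle([1, 2, 3], [2, 1, 4])   # ~0.3552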


def tied_rank(x):
    """
    Computes the tied rank of elements in x.

    This function computes the tied rank of elements in x.

    :param x: list of numbers, numpy array

    :returns: list of numbers
              The tied rank of each element in x
    """
    sorted_x = sorted(zip(x, range(len(x))))
    r = [0] * len(x)
    cur_val = sorted_x[0][0]
    last_rank = 0
    for i, e in enumerate(sorted_x):
        if cur_val != e[0]:
            cur_val = e[0]
            # Close out the previous group of ties with its average rank.
            for j in range(last_rank, i):
                r[sorted_x[j][1]] = float(last_rank + 1 + i) / 2.0
            last_rank = i
        if i == len(sorted_x) - 1:
            # Close out the final group; index through sorted_x so every
            # tied element receives its rank, not just the last one.
            for j in range(last_rank, i + 1):
                r[sorted_x[j][1]] = float(last_rank + i + 2) / 2.0
    return r
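
# Usage sketch (illustrative example, not part of the original module):
# tied values share the average of the ranks they would otherwise occupy
# (here the two 3s split ranks 2 and 3).
#
# >>> tied_rank([5, 1, 3, 3])   # [4.0, 1.0, 2.5, 2.5]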


def auc(actual, posterior):
    """
    Computes the area under the receiver operating characteristic
    curve (AUC).

    This function computes the AUC error metric for binary
    classification.

    :param actual: list of binary numbers, numpy array
                   The ground truth value
    :param posterior: same type as actual
                      Defines a ranking on the binary numbers, from most
                      likely to be positive to least likely to be positive.

    :returns: double
              The AUC between actual and posterior
    """
    r = tied_rank(posterior)
    num_positive = len([0 for x in actual if x == 1])
    num_negative = len(actual) - num_positive
    sum_positive = sum([r[i] for i in range(len(r)) if actual[i] == 1])
    area_under_curve = ((sum_positive -
                         num_positive * (num_positive + 1) / 2.0) /
                        (num_negative * num_positive))
    return area_under_curve
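
# Usage sketch (illustrative example, not part of the original module):
# this is the rank-statistic (Mann-Whitney) form of AUC. Below, three of
# the four positive/negative pairs are ranked correctly:
#
# >>> auc([1, 1, 0, 0], [0.9, 0.3, 0.4, 0.2])   # 0.75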


def f05_opt(actual, predicted, sample_weight=None):
    """
    Computes the F0.5-Score after optimal predictions thresholding.

    This function maximizes the F0.5-Score by means of
    optimal predictions thresholding.

    :param actual: numpy array
                   The ground truth value
    :param predicted: numpy array
                      The predicted value
    :param sample_weight: numpy array or None
                          sample weights

    :returns: double
              The optimal F0.5-Score
    """
    from ..libs.lib_utils import CPUlib

    lib = CPUlib.get()
    if sample_weight is None:
        return lib.f05_opt(actual.ravel(), predicted.ravel())
    return lib.f05_opt(actual.ravel(), predicted.ravel(),
                       sample_weight.ravel())


def f1_opt(actual, predicted, sample_weight=None):
    """
    Computes the F1-Score after optimal predictions thresholding.

    This function maximizes the F1-Score by means of
    optimal predictions thresholding.

    :param actual: numpy array
                   The ground truth value
    :param predicted: numpy array
                      The predicted value
    :param sample_weight: numpy array or None
                          sample weights

    :returns: double
              The optimal F1-Score
    """
    from ..libs.lib_utils import CPUlib

    lib = CPUlib.get()
    if sample_weight is None:
        return lib.f1_opt(actual.ravel(), predicted.ravel())
    return lib.f1_opt(actual.ravel(), predicted.ravel(),
                      sample_weight.ravel())


def f2_opt(actual, predicted, sample_weight=None):
    """
    Computes the F2-Score after optimal predictions thresholding.

    This function maximizes the F2-Score by means of
    optimal predictions thresholding.

    :param actual: numpy array
                   The ground truth value
    :param predicted: numpy array
                      The predicted value
    :param sample_weight: numpy array or None
                          sample weights

    :returns: double
              The optimal F2-Score
    """
    from ..libs.lib_utils import CPUlib

    lib = CPUlib.get()
    if sample_weight is None:
        return lib.f2_opt(actual.ravel(), predicted.ravel())
    return lib.f2_opt(actual.ravel(), predicted.ravel(),
                      sample_weight.ravel())


def mcc_opt(actual, predicted, sample_weight=None):
    """
    Computes the MCC after optimal predictions thresholding.

    This function maximizes the Matthews Correlation Coefficient (MCC)
    by means of optimal predictions thresholding.

    :param actual: numpy array
                   The ground truth value
    :param predicted: numpy array
                      The predicted value
    :param sample_weight: numpy array or None
                          sample weights

    :returns: double
              The optimal MCC
    """
    from ..libs.lib_utils import CPUlib

    lib = CPUlib.get()
    if sample_weight is None:
        return lib.mcc_opt(actual.ravel(), predicted.ravel())
    return lib.mcc_opt(actual.ravel(), predicted.ravel(),
                       sample_weight.ravel())


def acc_opt(actual, predicted, sample_weight=None):
    """
    Computes the Accuracy after optimal predictions thresholding.

    This function maximizes the Accuracy by means of
    optimal predictions thresholding.

    :param actual: numpy array
                   The ground truth value
    :param predicted: numpy array
                      The predicted value
    :param sample_weight: numpy array or None
                          sample weights

    :returns: double
              The optimal Accuracy
    """
    from ..libs.lib_utils import CPUlib

    lib = CPUlib.get()
    if sample_weight is None:
        return lib.acc_opt(actual.ravel(), predicted.ravel())
    return lib.acc_opt(actual.ravel(), predicted.ravel(),
                       sample_weight.ravel())
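
# Usage sketch (illustrative example, not part of the original module):
# the *_opt functions share one calling convention and all dispatch to
# the compiled h2o4gpu CPU library, which must be built and loadable.
# Inputs are flattened with ravel(), so 1-D float arrays are expected.
#
# >>> import numpy as np
# >>> y = np.array([1., 0., 1., 1., 0.])
# >>> p = np.array([0.8, 0.3, 0.6, 0.9, 0.4])
# >>> f1_opt(y, p)                                  # best F1 over thresholds
# >>> mcc_opt(y, p, sample_weight=np.ones_like(y))  # weighted variant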


def confusion_matrices(actual, predicted, sample_weight=None):
    """
    Computes confusion matrices for ROC analysis.

    This function computes confusion matrices for all possible
    prediction thresholds.

    :param actual: numpy array
                   The ground truth value
    :param predicted: numpy array
                      The predicted value
    :param sample_weight: numpy array or None
                          sample weights

    :returns: pandas DataFrame
              Confusion matrices for each unique predicted value
              as threshold
    """
    cm_stats_cols = ['p', 'tp', 'tn', 'fp', 'fn',
                     'fpr', 'tpr', 'mcc', 'f1', 'f05', 'f2']
    res = np.zeros((actual.shape[0], len(cm_stats_cols)))

    from ..libs.lib_utils import CPUlib

    lib = CPUlib.get()
    if sample_weight is None:
        lib.confusion_matrices(actual.ravel(), predicted.ravel(), res)
    else:
        lib.confusion_matrices(actual.ravel(), predicted.ravel(),
                               sample_weight.ravel(), res)
    # Drop all-zero rows (unused slots), keeping one row per threshold.
    return pd.DataFrame(res[~np.all(res == 0, axis=1)],
                        columns=cm_stats_cols)
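
# Usage sketch (illustrative example, not part of the original module;
# like the *_opt functions above, it requires the compiled CPU library):
#
# >>> cms = confusion_matrices(y, p)          # y, p as in the sketch above
# >>> cms[['p', 'tp', 'fp', 'tpr', 'fpr']]    # one row per threshold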