Source code for h2o4gpu.util.import_data

# - * - encoding : utf - 8 - * -
"""
:copyright: 2017-2018 H2O.ai, Inc.
:license:   Apache License Version 2.0 (see LICENSE for details)
"""


[docs]def import_data(data_path, use_pandas=False, intercept=True, valid_fraction=0.2, classification=True): """Import Data for H2O GPU Edition This function will read in data and prepare it for H2O4GPU's GLM solver. Note, the data is assumed to be all numeric,i.e., categoricals are one hot encoded, etc. :param data_path: str A path to a dataset (The dataset needs to be all numeric) :param use_pandas: bool Indicate if Pandas should be used to parse :param intercept: bool Indicate if intercept term is needed :param valid_fraction: float Percentage of dataset reserved for a validation set :param classification: bool Classification problem? :returns If valid_fraction > 0 it will return the following: train_x: numpy array of train input variables train_y: numpy array of y variable valid_x: numpy array of valid input variables valid_y: numpy array of valid y variable family : string that would either be "logistic" if classification is set to True, otherwise "elasticnet" If valid_fraction == 0 it will return the following: train_x: numpy array of train input variables train_y: numpy array of y variable family : string that would either be "logistic" if classification is set to True,otherwise "elasticnet" """ data_file = data_path # If importing using pandas if use_pandas: print("Reading Data with Pandas") import pandas as pd data = pd.read_csv(data_file) else: print("Reading Data with Feather") import feather data = feather.read_dataframe(data_file) print(data.shape) import numpy as np data_x = np.array( data.iloc[:, :data.shape[1] - 1], dtype='float32', order='C', copy=False) data_y = np.array( data.iloc[:, data.shape[1] - 1], dtype='float32', order='C', copy=False) # Setup train / validation set split # (assuming form of mxn where m = row count and n = col count) morig = data_x.shape[0] norig = data_x.shape[1] print("Original m=%d n=%d" % (morig, norig)) import sys sys.stdout.flush() # Do train / valid split if valid_fraction > 0: HO = int(valid_fraction * morig) H = morig - HO print("Size of Train rows=%d & valid rows=%d" % (H, HO)) sys.stdout.flush() train_x = data_x[0:H, :] train_y = data_y[0:H] valid_x = data_x[H:morig, :] valid_y = data_y[H:morig] print("Size of Train cols=%d valid cols=%d" % (train_x.shape[1], valid_x.shape[1])) else: train_x = data_x train_y = data_y # Using intercept if intercept: train_x = np.hstack( [train_x, np.ones((train_x.shape[0], 1), dtype=train_x.dtype)]) if valid_fraction > 0: valid_x = np.hstack( [valid_x, np.ones((valid_x.shape[0], 1), dtype=valid_x.dtype)]) print("Size of Train cols=%d & valid cols=%d after adding " "intercept column" % (train_x.shape[1], valid_x.shape[1])) else: print("Size of Train cols=%d after adding intercept column" % (train_x.shape[1])) if classification: family = "logistic" else: family = "elasticnet" if valid_fraction > 0: return train_x, train_y, valid_x, valid_y, family return train_x, train_y, family