public class ModelSelectionUtils
extends java.lang.Object
Modifier and Type | Class and Description |
---|---|
static class |
ModelSelectionUtils.SweepVector
Stores information on the sweeping actions that are to be performed on new rows/columns added to the CPM due to the
addition of new predictors.
|
Constructor and Description |
---|
ModelSelectionUtils() |
Modifier and Type | Method and Description |
---|---|
static double[][] |
addNewPred2CPM(double[][] allCPM,
water.fvec.Frame allCPMFrame,
double[][] currentCPM,
int[] subsetPredIndex,
int[][] pred2CPMIndices,
boolean hasIntercept)
Given the current CPM, which has already been swept, we need to add the latest predictor, which has not been swept,
to the current CPM.
|
static void |
applySweepVectors2NewPred(ModelSelectionUtils.SweepVector[][] sweepVec,
double[][] subsetCPM,
int numNewRows,
int[] sweepMat)
This method will sweep the rows/columns added to the CPM due to the addition of the new predictor using sweep
vector arrays.
|
static GLM[] |
buildGLMBuilders(GLMModel.GLMParameters[] trainingParams) |
static double |
calR2Scale(water.fvec.Frame train,
java.lang.String resp) |
static double[] |
dropIgnoredCols(GLMTask.GLMIterationTask gtask,
java.util.List<java.lang.Integer> ignoredCols) |
static java.util.List<java.lang.Integer> |
extractCPMIndexFromPred(int cpmLastIndex,
int[][] pred2CPMIndices,
int[] newPredList,
boolean hasIntercept) |
static java.util.List<java.lang.Integer> |
extractCPMIndexFromPredOnly(int[][] pred2CPMIndices,
int[] newPredList)
Given the predictors in subset newPredList, this function will find the rows/columns in the cpm matrix that
are contributed by the predictors in subset newPredList.
|
static java.lang.String[] |
extractPredictorNames(hex.Model.Parameters parms,
DataInfo dinfo,
java.lang.String foldColumn) |
static double[][] |
extractPredSubsetsCPM(double[][] allCPM,
int[] predIndices,
int[][] pred2CPMIndices,
boolean hasIntercept)
Given a predictor subset and the complete CPM, we extract the CPM associated with the predictors
specified in the predictor subset (predIndices).
|
static double[][] |
extractPredSubsetsCPMFrame(water.fvec.Frame allCPM,
int[] predIndices,
int[][] pred2CPMIndices,
boolean hasIntercept)
Given a predictor subset and the complete CPM, we extract the CPM associated with the predictors
specified in the predictor subset (predIndices).
|
static int[] |
extractSweepIndices(java.util.List<java.lang.Integer> currSubsetIndices,
int predPos,
int predRemoved,
int[][] predInd2CPMIndices,
boolean hasIntercept)
Given predRemoved (the predictor that is to be removed and replaced in the forward step), this method will
calculate the locations of the CPM rows/columns associated with it.
|
static java.util.List<java.lang.String> |
extraModelColumnNames(java.util.List<java.lang.String> coefNames,
GLMModel bestModel) |
static GLMModel |
findBestModel(GLM[] glmResults)
Given GLM run results of a fixed number of predictors, find the model with the best R2 value.
|
static hex.modelselection.ModelSelectionUtils.PredNameMinZVal |
findCatMinOfMaxZScore(GLMModel model,
java.util.List<java.lang.Double> zValList)
This method extracts the categorical coefficient z-score (abs(z-value)) by using the following method:
1.
|
static java.util.List<java.lang.Integer> |
findFullDupPred(DataInfo dinfo,
java.util.List<java.lang.Integer> ignoredCols,
java.util.List<java.lang.String> ignoredPredNames,
java.util.List<java.lang.String> ignoredCoefNames,
java.lang.String[] prednames)
The duplicated columns generated by qr-cholesky are at the level of coefficients.
|
static int |
findMinZValue(GLMModel model,
java.util.List<java.lang.String> numPredNames,
java.util.List<java.lang.String> catPredNames,
java.util.List<java.lang.String> predNames) |
static hex.modelselection.ModelSelectionUtils.PredNameMinZVal |
findNumMinZVal(java.util.List<java.lang.String> numPredNames,
java.util.List<java.lang.Double> zValList,
java.util.List<java.lang.String> coeffNames) |
static double[][] |
formCPM(Gram gram,
double[] xTransposey,
double yy) |
static hex.modelselection.ModelSelectionUtils.CPMnPredNames |
genCPMPredNamesIndex(water.Key jobKey,
DataInfo dinfo,
java.lang.String[] predictorNames,
ModelSelectionModel.ModelSelectionParameters parms) |
static double[] |
generateAllErrVar(double[][] allCPM,
water.fvec.Frame allCPMFrame,
int prevCPMSize,
java.util.List<java.lang.Integer> currSubsetIndices,
java.util.List<java.lang.Integer> validSubsets,
java.util.Set<java.util.BitSet> usedCombo,
java.util.BitSet tempIndices,
int[][] pred2CPMIndices,
boolean hasIntercept)
Given the original predictor subset, this function will go into a for loop and choose one predictor out of the
remaining predictor set validSubsets and put it into the array allPreds.
|
static double[] |
generateAllErrVarR(double[][] allCPM,
water.fvec.Frame allCPMFrame,
double[][] prevCPM,
int predPos,
java.util.List<java.lang.Integer> currSubsetIndices,
java.util.List<java.lang.Integer> validSubsets,
java.util.Set<java.util.BitSet> usedCombo,
java.util.BitSet tempIndices,
int[][] pred2CPMIndices,
boolean hasIntercept,
int[] removedPredSweepInd,
ModelSelectionUtils.SweepVector[][] removedPredSV)
Given the original predictor subset, this function will go into a for loop and choose one predictor out of the
remaining predictor set validSubsets and put it into the array allPreds.
|
static GLMModel.GLMParameters[] |
generateGLMParameters(water.fvec.Frame[] trainingFrames,
ModelSelectionModel.ModelSelectionParameters parms,
int nfolds,
java.lang.String foldColumn,
hex.Model.Parameters.FoldAssignmentScheme foldAssignment) |
static water.fvec.Frame[] |
generateMaxRTrainingFrames(ModelSelectionModel.ModelSelectionParameters parms,
java.lang.String[] predictorNames,
java.lang.String foldColumn,
java.util.List<java.lang.Integer> currSubsetIndices,
int newPredPos,
java.util.List<java.lang.Integer> validSubsets,
java.util.Set<java.util.BitSet> usedCombo)
double
|
static water.fvec.Frame |
generateOneFrame(int[] predIndices,
hex.Model.Parameters parms,
java.lang.String[] predNames,
java.lang.String foldColumn)
Given a predictor indices set, this function will generate a training frame containing the predictors with
indices in predIndices.
|
static water.fvec.Frame[] |
generateTrainingFrames(ModelSelectionModel.ModelSelectionParameters parms,
int predNum,
java.lang.String[] predNames,
int numModels,
java.lang.String foldColumn) |
static GLMTask.GLMIterationTask |
genGramCheckDup(water.Key jobKey,
DataInfo dinfo,
java.util.ArrayList<java.lang.Integer> ignoredCols,
ModelSelectionModel.ModelSelectionParameters parms) |
static void |
genMSE1stPred(int[][] pred2CPMIndices,
double[][] allCPM,
water.fvec.Frame allCPMFrame,
int[] allPreds,
double[] subsetMSE,
jsr166y.RecursiveAction[] resA,
int resCount,
boolean hasIntercept)
This method will calculate the error variance when only one predictor is considered in allPreds.
|
static void |
genMSE4MorePreds(int[][] pred2CPMIndices,
double[][] allCPM,
water.fvec.Frame allCPMFrame,
int[] allPreds,
int lastSweepIndex,
double[] subsetMSE,
jsr166y.RecursiveAction[] resA,
int resCount,
boolean hasIntercept)
This method will calculate the error variance value for all predictors in the allPreds.
|
static void |
genMSE4MorePredsR(int[][] pred2CPMIndices,
double[][] allCPM,
water.fvec.Frame allCPMFrame,
double[][] prevCPM,
int[] allPreds,
double[] subsetMSE,
jsr166y.RecursiveAction[] resA,
int resCount,
boolean hasIntercept,
ModelSelectionUtils.SweepVector[][] removePredSV,
int[] removedPredSweepInd)
Generate the error variance for one predictor subset setting in allPreds.
|
static java.lang.String |
joinDouble(double[] val) |
static ModelSelectionUtils.SweepVector[][] |
mapBasicVector2Multiple(ModelSelectionUtils.SweepVector[][] sweepVec,
int newPredCPMLen)
When multiple rows/columns are added to the CPM due to the new predictor being categorical, we need to map the
old sweep vector arrays to new bigger sweep vector arrays.
|
static int[][] |
mapPredIndex2CPMIndices(DataInfo dinfo,
int numPreds,
java.util.List<java.lang.Integer> ignoredPredInd)
This method attempts to map all predictors into the corresponding cpm indices that refer to that predictor.
|
static void |
oneSweepWSweepVector(ModelSelectionUtils.SweepVector[] sweepVec,
double[][] subsetCPM,
int sweepIndex,
int colRowsAdded)
This method performs just one sweep of the sweeping action described in Step 3 of section V.II.IV of doc.
|
static void |
performOneSweep(double[][] subsetCPM,
ModelSelectionUtils.SweepVector[] sweepVec,
int sweepIndex,
boolean genSweepVector)
Perform one sweep according to section II of doc and generate sweep vector according to section V.II of doc.
|
static void |
process(hex.modelselection.ModelSelectionUtils.SweepElement currEle,
java.util.List<hex.modelselection.ModelSelectionUtils.SweepElement> tempList)
This method will generate all the elements that are needed to perform sweeping on the currEle.
|
static void |
removeTrainingFrames(water.fvec.Frame[] trainingFrames) |
static void |
setBitSet(java.util.BitSet predBitSet,
int[] currIndices) |
static void |
setParamField(hex.Model.Parameters params,
GLMModel.GLMParameters glmParam,
boolean superClassParams,
java.lang.reflect.Field[] paramFields,
java.util.List<java.lang.String> excludeList) |
static double[][] |
shrinkDoubleArray(double[][] array,
int numModels) |
static water.Key[] |
shrinkKeyArray(water.Key[] array,
int numModels) |
static java.lang.String[][] |
shrinkStringArray(java.lang.String[][] array,
int numModels) |
static ModelSelectionUtils.SweepVector[][] |
sweepCPM(double[][] subsetCPM,
int[] sweepIndices,
boolean genSweepVector)
This method performs the sweeping action described in section II of doc.
|
static void |
sweepCPMElements(java.util.Set<hex.modelselection.ModelSelectionUtils.SweepElement>[] sweepElements,
double[][] subsetCPM) |
static void |
sweepCPMParallel(water.fvec.Frame cpm,
int[] sweepIndices,
int[] trackPivotSweeps) |
static double |
sweepMSE(double[][] subsetCPM,
java.util.List<java.lang.Integer> sweepIndices)
This function performs sweeping on the last row and column only to update the variance error to reduce
computation time.
|
static void |
updateLaterIndices(int[] currentPredIndices,
int indexUpdated,
int lastPredInd)
Given 5 predictors and say we want the combo of 3 predictors, this function will properly reset the prediction
combination indices say from [0, 1, 4] -> [0, 2, 3] or [0, 3, 4] -> [1, 2, 3].
|
static void |
updatePredIndices(int[] currentPredIndices,
int[] indicesBounds)
Given predictor indices stored in currentPredIndices, we need to find the next combination of predictor indices
to use to generate the next combination.
|
public static water.fvec.Frame[] generateTrainingFrames(ModelSelectionModel.ModelSelectionParameters parms, int predNum, java.lang.String[] predNames, int numModels, java.lang.String foldColumn)
public static void updatePredIndices(int[] currentPredIndices, int[] indicesBounds)
currentPredIndices
- indicesBounds
- public static void updateLaterIndices(int[] currentPredIndices, int indexUpdated, int lastPredInd)
currentPredIndices
- indexUpdated
- lastPredInd
- public static water.fvec.Frame generateOneFrame(int[] predIndices, hex.Model.Parameters parms, java.lang.String[] predNames, java.lang.String foldColumn)
predIndices
- parms
- predNames
- public static void setBitSet(java.util.BitSet predBitSet, int[] currIndices)
public static hex.modelselection.ModelSelectionUtils.CPMnPredNames genCPMPredNamesIndex(water.Key jobKey, DataInfo dinfo, java.lang.String[] predictorNames, ModelSelectionModel.ModelSelectionParameters parms)
public static int[][] mapPredIndex2CPMIndices(DataInfo dinfo, int numPreds, java.util.List<java.lang.Integer> ignoredPredInd)
public static double[][] formCPM(Gram gram, double[] xTransposey, double yy)
public static double[] dropIgnoredCols(GLMTask.GLMIterationTask gtask, java.util.List<java.lang.Integer> ignoredCols)
public static java.util.List<java.lang.Integer> findFullDupPred(DataInfo dinfo, java.util.List<java.lang.Integer> ignoredCols, java.util.List<java.lang.String> ignoredPredNames, java.util.List<java.lang.String> ignoredCoefNames, java.lang.String[] prednames)
public static GLMTask.GLMIterationTask genGramCheckDup(water.Key jobKey, DataInfo dinfo, java.util.ArrayList<java.lang.Integer> ignoredCols, ModelSelectionModel.ModelSelectionParameters parms)
public static double calR2Scale(water.fvec.Frame train, java.lang.String resp)
public static water.fvec.Frame[] generateMaxRTrainingFrames(ModelSelectionModel.ModelSelectionParameters parms, java.lang.String[] predictorNames, java.lang.String foldColumn, java.util.List<java.lang.Integer> currSubsetIndices, int newPredPos, java.util.List<java.lang.Integer> validSubsets, java.util.Set<java.util.BitSet> usedCombo)
predictorNames
- foldColumn
- currSubsetIndices
- validSubsets
- Lists containing only valid predictor indices to choose from
public static double[] generateAllErrVarR(double[][] allCPM, water.fvec.Frame allCPMFrame, double[][] prevCPM, int predPos, java.util.List<java.lang.Integer> currSubsetIndices, java.util.List<java.lang.Integer> validSubsets, java.util.Set<java.util.BitSet> usedCombo, java.util.BitSet tempIndices, int[][] pred2CPMIndices, boolean hasIntercept, int[] removedPredSweepInd, ModelSelectionUtils.SweepVector[][] removedPredSV)
public static void genMSE4MorePredsR(int[][] pred2CPMIndices, double[][] allCPM, water.fvec.Frame allCPMFrame, double[][] prevCPM, int[] allPreds, double[] subsetMSE, jsr166y.RecursiveAction[] resA, int resCount, boolean hasIntercept, ModelSelectionUtils.SweepVector[][] removePredSV, int[] removedPredSweepInd)
public static double[] generateAllErrVar(double[][] allCPM, water.fvec.Frame allCPMFrame, int prevCPMSize, java.util.List<java.lang.Integer> currSubsetIndices, java.util.List<java.lang.Integer> validSubsets, java.util.Set<java.util.BitSet> usedCombo, java.util.BitSet tempIndices, int[][] pred2CPMIndices, boolean hasIntercept)
public static void genMSE4MorePreds(int[][] pred2CPMIndices, double[][] allCPM, water.fvec.Frame allCPMFrame, int[] allPreds, int lastSweepIndex, double[] subsetMSE, jsr166y.RecursiveAction[] resA, int resCount, boolean hasIntercept)
public static double sweepMSE(double[][] subsetCPM, java.util.List<java.lang.Integer> sweepIndices)
public static void sweepCPMElements(java.util.Set<hex.modelselection.ModelSelectionUtils.SweepElement>[] sweepElements, double[][] subsetCPM)
public static void process(hex.modelselection.ModelSelectionUtils.SweepElement currEle, java.util.List<hex.modelselection.ModelSelectionUtils.SweepElement> tempList)
public static void genMSE1stPred(int[][] pred2CPMIndices, double[][] allCPM, water.fvec.Frame allCPMFrame, int[] allPreds, double[] subsetMSE, jsr166y.RecursiveAction[] resA, int resCount, boolean hasIntercept)
public static ModelSelectionUtils.SweepVector[][] mapBasicVector2Multiple(ModelSelectionUtils.SweepVector[][] sweepVec, int newPredCPMLen)
public static void applySweepVectors2NewPred(ModelSelectionUtils.SweepVector[][] sweepVec, double[][] subsetCPM, int numNewRows, int[] sweepMat)
public static void oneSweepWSweepVector(ModelSelectionUtils.SweepVector[] sweepVec, double[][] subsetCPM, int sweepIndex, int colRowsAdded)
public static double[][] addNewPred2CPM(double[][] allCPM, water.fvec.Frame allCPMFrame, double[][] currentCPM, int[] subsetPredIndex, int[][] pred2CPMIndices, boolean hasIntercept)
public static int[] extractSweepIndices(java.util.List<java.lang.Integer> currSubsetIndices, int predPos, int predRemoved, int[][] predInd2CPMIndices, boolean hasIntercept)
public static java.util.List<java.lang.Integer> extractCPMIndexFromPred(int cpmLastIndex, int[][] pred2CPMIndices, int[] newPredList, boolean hasIntercept)
public static java.util.List<java.lang.Integer> extractCPMIndexFromPredOnly(int[][] pred2CPMIndices, int[] newPredList)
public static ModelSelectionUtils.SweepVector[][] sweepCPM(double[][] subsetCPM, int[] sweepIndices, boolean genSweepVector)
public static void sweepCPMParallel(water.fvec.Frame cpm, int[] sweepIndices, int[] trackPivotSweeps)
public static void performOneSweep(double[][] subsetCPM, ModelSelectionUtils.SweepVector[] sweepVec, int sweepIndex, boolean genSweepVector)
public static java.lang.String[][] shrinkStringArray(java.lang.String[][] array, int numModels)
public static double[][] shrinkDoubleArray(double[][] array, int numModels)
public static water.Key[] shrinkKeyArray(water.Key[] array, int numModels)
public static java.lang.String joinDouble(double[] val)
public static GLMModel.GLMParameters[] generateGLMParameters(water.fvec.Frame[] trainingFrames, ModelSelectionModel.ModelSelectionParameters parms, int nfolds, java.lang.String foldColumn, hex.Model.Parameters.FoldAssignmentScheme foldAssignment)
public static void setParamField(hex.Model.Parameters params, GLMModel.GLMParameters glmParam, boolean superClassParams, java.lang.reflect.Field[] paramFields, java.util.List<java.lang.String> excludeList)
public static GLM[] buildGLMBuilders(GLMModel.GLMParameters[] trainingParams)
public static void removeTrainingFrames(water.fvec.Frame[] trainingFrames)
public static GLMModel findBestModel(GLM[] glmResults)
glmResults
- public static java.lang.String[] extractPredictorNames(hex.Model.Parameters parms, DataInfo dinfo, java.lang.String foldColumn)
public static int findMinZValue(GLMModel model, java.util.List<java.lang.String> numPredNames, java.util.List<java.lang.String> catPredNames, java.util.List<java.lang.String> predNames)
public static hex.modelselection.ModelSelectionUtils.PredNameMinZVal findNumMinZVal(java.util.List<java.lang.String> numPredNames, java.util.List<java.lang.Double> zValList, java.util.List<java.lang.String> coeffNames)
public static hex.modelselection.ModelSelectionUtils.PredNameMinZVal findCatMinOfMaxZScore(GLMModel model, java.util.List<java.lang.Double> zValList)
public static java.util.List<java.lang.String> extraModelColumnNames(java.util.List<java.lang.String> coefNames, GLMModel bestModel)
public static double[][] extractPredSubsetsCPM(double[][] allCPM, int[] predIndices, int[][] pred2CPMIndices, boolean hasIntercept)
public static double[][] extractPredSubsetsCPMFrame(water.fvec.Frame allCPM, int[] predIndices, int[][] pred2CPMIndices, boolean hasIntercept)