Skip to content

Commit

Permalink
GH-15697 glm mojo offset fold predict error (#16159)
Browse files Browse the repository at this point in the history
* GH-15697: add R test to reproduce error.

* add generic model to test as well.
* Add fold column info to generic model.
* add compareFrame to compare prediction results.
* add prediction with fold column removed

Co-authored-by: wendycwong <[email protected]>
Co-authored-by: Veronika Maurerová <[email protected]>
  • Loading branch information
3 people committed Apr 19, 2024
1 parent 054a599 commit 0c60e3e
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,9 @@ private void addCommonModelInfo() throws IOException {
if (model.offsetColumn() != null) {
writekv("offset_column", model.offsetColumn());
}
if (model.foldColumn() != null) {
writekv("fold_column", model.foldColumn());
}
writekv("balance_classes", model.balanceClasses());
writekv("default_threshold", model.defaultThreshold());
writekv("prior_class_distrib", Arrays.toString(model.priorClassDist()));
Expand Down
1 change: 1 addition & 0 deletions h2o-genmodel/src/main/java/hex/genmodel/GenModel.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ public abstract class GenModel implements IGenModel, IGeneratedModel, Serializab

/** Name of the column with offsets (used for certain types of models). */
public String _offsetColumn;
public String _foldColumn;

/** Name of the column that determines the treatment group; currently only for UpliftDRF models. */
public String _treatmentColumn;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ private void readAll(final boolean readModelMetadata) throws IOException {
_model._priorClassDistrib = readkv("prior_class_distrib");
_model._modelClassDistrib = readkv("model_class_distrib");
_model._offsetColumn = readkv("offset_column");
_model._foldColumn = readkv("fold_column");
_model._mojo_version = ((Number) readkv("mojo_version")).doubleValue();
checkMaxSupportedMojoVersion();
readModelData();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ public static class MojoModelDescriptor implements ModelDescriptor, Serializable
private final double[] _priorClassDistrib;
private final double[] _modelClassDistrib;
private final String _offsetColumn;
private final String _foldColumn;
private final String _weightsColumn;
private final String _treatmentColumn;
private final String[][] _domains;
Expand All @@ -65,6 +66,7 @@ private MojoModelDescriptor(final MojoModel mojoModel, final String fullAlgorith
_modelClassDistrib = mojoModel._modelClassDistrib;
_h2oVersion = mojoModel._h2oVersion;
_offsetColumn = mojoModel._offsetColumn;
_foldColumn = mojoModel._foldColumn;
_domains = mojoModel._domains;
_origDomains = mojoModel.getOrigDomainValues();
_names = mojoModel._names;
Expand Down Expand Up @@ -116,7 +118,7 @@ public String weightsColumn() {

@Override
public String foldColumn() {
return null;
return _foldColumn;
}

@Override
Expand Down
63 changes: 63 additions & 0 deletions h2o-r/tests/testdir_algos/glm/runit_gh_15697_mojo_offset_cv.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Switch to the directory of the script given by the "f" command-line argument
# so that the relative path to the test setup script below resolves.
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
# Load the shared H2O R test setup script.
source("../../../scripts/h2o-r-test-setup.R")
# NOTE(review): no data.table function appears to be called in this test -
# confirm whether this dependency is actually required.
library(data.table)

# GH-15697: a GLM trained with both an offset column and a fold column must
# score identically whether it is used in-memory, re-imported as a MOJO,
# loaded as a generic model, or re-loaded from its binary form. Scoring with
# the MOJO-backed models must also work when the fold column is absent from
# the scoring frame, since the fold column is only used during training.
test_glm_mojo_offset_fold_columns <- function() {
  mt <- as.h2o(mtcars)

  # Reference model: both "cyl" and "disp" are regular predictors.
  find_offset <- h2o.glm(
    x = c("cyl", "disp"),
    y = "mpg",
    training_frame = mt,
    lambda = 0
  )

  h2o.residual_deviance(find_offset)
  # [1] 270.7403

  # Create an offset that reproduces the "cyl" contribution of the reference
  # model exactly (makes the two models easy to compare).
  mt$offset <- mt$cyl * find_offset@model$coefficients[["cyl"]]
  mt$fold <- h2o.kfold_column(data = mt, nfolds = 3, seed = 123)

  # Move "cyl" from being modeled to being an offset and build the model with
  # a fold column. The argument name is spelled out in full instead of relying
  # on partial matching of `offset` to `offset_column`.
  mod_w_offset <- h2o.glm(
    x = c("disp"),
    y = "mpg",
    training_frame = mt,
    offset_column = "offset",
    lambda = 0,
    fold_column = "fold"
  )

  h2o.residual_deviance(mod_w_offset)
  # [1] 270.7403 (match as expected)

  # Save out the model in both formats, then immediately re-import.
  mojo_path <- h2o.save_mojo(object = mod_w_offset, path = ".")
  biny_path <- h2o.saveModel(object = mod_w_offset, path = ".")

  mojo <- h2o.import_mojo(mojo_file_path = mojo_path)
  generic_glm <- h2o.genericModel(mojo_path)
  biny <- h2o.loadModel(path = biny_path)

  # Local names avoid shadowing the `predict` generic.
  pred_original <- h2o.predict(object = mod_w_offset, newdata = mt)
  pred_mojo <- h2o.predict(object = mojo, newdata = mt)
  pred_binary <- h2o.predict(object = biny, newdata = mt)
  pred_generic <- h2o.predict(object = generic_glm, newdata = mt)

  # All predictions must match the original in-memory model.
  compareFrames(pred_original, pred_mojo)
  compareFrames(pred_original, pred_binary)
  compareFrames(pred_original, pred_generic)

  # Remove the fold column from the scoring frame. It is selected by name
  # rather than by a hard-coded position so the test keeps working if the
  # column layout of the frame changes.
  mt_no_fold <- mt[, setdiff(names(mt), "fold")]
  pred_mojo_no_fold <- h2o.predict(object = mojo, newdata = mt_no_fold)
  pred_generic_no_fold <- h2o.predict(object = generic_glm, newdata = mt_no_fold)

  # Predictions must still match once the fold column is gone.
  compareFrames(pred_original, pred_mojo_no_fold)
  compareFrames(pred_original, pred_generic_no_fold)
}

# Hand the test function and its description to doTest(), which is not defined
# in this file (presumably provided by the sourced test setup script).
doTest("Fix GLM mojo with offset and fold column", test_glm_mojo_offset_fold_columns)

0 comments on commit 0c60e3e

Please sign in to comment.