Skip to content

Commit

Permalink
GH-15947: fixed column length discrepancies when skipped_columns are …
Browse files Browse the repository at this point in the history
…specified when calling h2o.H2OFrame.

GH-15947: fixed skipped columns for normal import_file path.
GH-15947: added same skipped column capability when transforming R data frames to h2O data frames.
GH-15947: add parameter to avoid R cmd test failure.
GH-15947: Incorporate Seb code review comments.
  • Loading branch information
wendycwong committed May 23, 2024
1 parent a72a974 commit da7946f
Show file tree
Hide file tree
Showing 7 changed files with 56 additions and 24 deletions.
11 changes: 6 additions & 5 deletions h2o-py/h2o/h2o.py
Original file line number Diff line number Diff line change
Expand Up @@ -868,14 +868,15 @@ def parse_setup(raw_frames, destination_frame=None, header=0, separator=None, co
if ind in skipped_columns:
use_type[ind]=False

if column_names is not None:
if column_names is not None: # used when python object is converted to H2O Frame
if not isinstance(column_names, list): raise ValueError("col_names should be a list")
if (skipped_columns is not None) and len(skipped_columns)>0:
if (len(column_names)) != parse_column_len:
# if ((len(column_names)-len(skipped_columns)) != parse_column_len) and (len(column_names) != parse_column_len):
if (len(column_names)-len(skipped_columns)) != parse_column_len:
raise ValueError(
"length of col_names should be equal to the number of columns parsed: %d vs %d"
% (len(column_names), parse_column_len))
else:
"length of col_names minus length of skipped_columns should equal the number of columns parsed: "
"%d vs %d" % ((len(column_names)-len(skipped_columns), parse_column_len)))
else: # no skipped columns here
if len(column_names) != len(j["column_types"]): raise ValueError(
"length of col_names should be equal to the number of columns: %d vs %d"
% (len(column_names), len(j["column_types"])))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,12 +125,10 @@ def H2OFrame_from_H2OFrame():
assert dupl4.columns == ["n1", "s1"]


def H2OFrame_skipped_columns_is_BUGGY():
try:
h2o.H2OFrame(data, skipped_columns=[1])
assert False, "skipped_columns handling may be fixed now" # parse_setup is absolutely weird, with only half parameters passed to build the ParseSetup, and then a bunch of logic done locally, that's why it's buggy: see issue https://github.com/h2oai/h2o-3/issues/15947
except ValueError as e:
assert "length of col_names should be equal to the number of columns parsed: 4 vs 3" in str(e)
def H2OFrame_skipped_columns_BUG_fixed():
f1 = h2o.H2OFrame(data, skipped_columns=[1])
f2 = h2o.H2OFrame(data)
assert f1.ncol == (f2.ncol-1), "expected number of columns: {0}, actual column numbers: {1}".format(f1.ncol, (f2.ncol-1))


pu.run_tests([
Expand All @@ -141,5 +139,5 @@ def H2OFrame_skipped_columns_is_BUGGY():
H2OFrame_from_pandas,
H2OFrame_from_scipy,
H2OFrame_from_H2OFrame,
H2OFrame_skipped_columns_is_BUGGY
H2OFrame_skipped_columns_BUG_fixed
])
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import sys
sys.path.insert(1,"../../")
import h2o
from tests import pyunit_utils

# Seb has reported that skipped_columns does not work if skipped_columns is called with h2o.H2OFrame
def test_skipped_columns():
data = [[1, 4, "a", 1], [2, 5, "b", 0], [3, 6, "", 1]]
frame = h2o.H2OFrame(data, skipped_columns=[1, 2])
assert frame.ncol == 2, "Expected column number: 2. Actual: {0}".format(frame.ncol)

if __name__ == "__main__":
pyunit_utils.standalone_test(test_skipped_columns)
else:
test_skipped_columns()
22 changes: 14 additions & 8 deletions h2o-r/h2o-package/R/frame.R
Original file line number Diff line number Diff line change
Expand Up @@ -4109,6 +4109,7 @@ use.package <- function(package,
#'
#' @param x An \code{R} object.
#' @param destination_frame A string with the desired name for the H2OFrame
#' @param skipped_columns A list of integer containing columns to be skipped and not parsed into the final frame
#' @param use_datatable allow usage of data.table
#' @param \dots arguments passed to method arguments.
#' @export
Expand All @@ -4135,15 +4136,19 @@ use.package <- function(package,
#' stopifnot(is.h2o(m_hf), dim(m_hf) == dim(m))
#' }
#' }
as.h2o <- function(x, destination_frame="", ...) {
as.h2o <- function(x, destination_frame="", skipped_columns=NULL, ...) {
.key.validate(destination_frame)
UseMethod("as.h2o")
if (is.null(skipped_columns)) {
UseMethod("as.h2o")
} else {
as.h2o.data.frame(x, destination_frame=destination_frame, skipped_columns=skipped_columns)
}
}

#' @rdname as.h2o
#' @method as.h2o default
#' @export
as.h2o.default <- function(x, destination_frame="", ...) {
as.h2o.default <- function(x, destination_frame="", skipped_columns=NULL, ...) {
if( destination_frame=="" ) {
subx <- destination_frame.guess(deparse(substitute(x)))
destination_frame <- .key.make(if(nzchar(subx)) subx else paste0(class(x), "_", collapse = ""))
Expand All @@ -4152,13 +4157,13 @@ as.h2o.default <- function(x, destination_frame="", ...) {
data.frame(C1=x)
else
as.data.frame(x, ...)
as.h2o.data.frame(x, destination_frame=destination_frame)
as.h2o.data.frame(x, destination_frame=destination_frame, skipped_columns=skipped_columns)
}

#' @rdname as.h2o
#' @method as.h2o H2OFrame
#' @export
as.h2o.H2OFrame <- function(x, destination_frame="", ...) {
as.h2o.H2OFrame <- function(x, destination_frame="", skipped_columns=NULL, ...) {
if( destination_frame=="" ) {
subx <- destination_frame.guess(deparse(substitute(x)))
destination_frame <- .key.make(if(nzchar(subx)) subx else "H2OFrame_copy")
Expand All @@ -4173,7 +4178,7 @@ as.h2o.H2OFrame <- function(x, destination_frame="", ...) {
#' @seealso \code{\link{use.package}}
#' @references \url{https://h2o.ai/blog/2016/fast-csv-writing-for-r/}
#' @export
as.h2o.data.frame <- function(x, destination_frame="", use_datatable=TRUE, ...) {
as.h2o.data.frame <- function(x, destination_frame="", skipped_columns=NULL, use_datatable=TRUE, ...) {
if( destination_frame=="" ) {
subx <- destination_frame.guess(deparse(substitute(x)))
destination_frame <- .key.make(if(nzchar(subx)) subx else "data.frame")
Expand Down Expand Up @@ -4203,7 +4208,8 @@ as.h2o.data.frame <- function(x, destination_frame="", use_datatable=TRUE, ...)
if (verbose) cat(sprintf("writing csv to disk using '%s' took %.2fs\n", fun, proc.time()[[3]]-pt))
#if (verbose) pt <- proc.time()[[3]] # timings inside
h2f <- h2o.uploadFile(tmpf, destination_frame = destination_frame, header = TRUE, col.types=types,
col.names=colnames(x, do.NULL=FALSE, prefix="C"), na.strings=rep(c("NA_h2o"),ncol(x)))
col.names=colnames(x, do.NULL=FALSE, prefix="C"), na.strings=rep(c("NA_h2o"),ncol(x)),
skipped_columns=skipped_columns)
#if (verbose) cat(sprintf("uploading csv to h2o using 'h2o.uploadFile' took %.2fs\n", proc.time()[[3]]-pt))
file.remove(tmpf)
h2f
Expand All @@ -4215,7 +4221,7 @@ as.h2o.data.frame <- function(x, destination_frame="", use_datatable=TRUE, ...)
#' To speedup execution time for large sparse matrices, use h2o datatable. Make sure you have installed and imported data.table and slam packages.
#' Turn on h2o datatable by options("h2o.use.data.table"=TRUE)
#' @export
as.h2o.Matrix <- function(x, destination_frame="", use_datatable=TRUE, ...) {
as.h2o.Matrix <- function(x, destination_frame="", skipped_columns=NULL, use_datatable=TRUE, ...) {
if( destination_frame=="") {
subx <- destination_frame.guess(deparse(substitute(x)))
destination_frame <- .key.make(if(nzchar(subx)) subx else "Matrix")
Expand Down
6 changes: 4 additions & 2 deletions h2o-r/h2o-package/R/parse.R
Original file line number Diff line number Diff line change
Expand Up @@ -219,8 +219,10 @@ h2o.parseSetup <- function(data, pattern="", destination_frame = "", header = NA
else
col.names
if (!is.null(parseSetup$column_names) &&
(length(parseSetup$column_names) != parsedColLength)) {
stop("length of col.names must equal to the number of columns in dataset")
(length(parseSetup$column_names) != parsedColLength)) { # should equal, if not, need to check skipped_columns
if ((!is.null(skipped_columns) && ((length(parseSetup$column_names)-length(skipped_columns)) != parsedColLength))
|| is.null(skipped_columns)) # if no skipped column, this is an error. If skipped columns, check length
stop("length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset")
}
# change column names to what the user specified
if (!is.null(skipped_columns)) {
Expand Down
4 changes: 2 additions & 2 deletions h2o-r/tests/testdir_jira/runit_hexdev_29_import_types.R
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ test.continuous.or.categorical <- function() {

e <- tryCatch(h2o.importFile(locate("smalldata/iris/iris.csv"), col.names=c("C1","C2","C3","C4","C5","C6"),
col.types=list(by.col.name=c("C4"),types=c("Enum"))), error = function(x) x)
expect_true(e[[1]] == "length of col.names must equal to the number of columns in dataset")
expect_true(e[[1]] == "length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset")

# col.types as character vector
df.hex4 <- h2o.importFile(locate("smalldata/iris/multiple_iris_files"),
Expand Down Expand Up @@ -98,7 +98,7 @@ test.continuous.or.categorical <- function() {

e <- tryCatch(h2o.importFile(locate("smalldata/iris/iris.csv"), col.names=c("C1","C2","C3","C4","C5","C6"),
col.types=list(by.col.name=c("C4"),types=c("Enum"))), error = function(x) x)
expect_true(e[[1]] == "length of col.names must equal to the number of columns in dataset")
expect_true(e[[1]] == "length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset")

# col.types as character vector
df.hex6 <- h2o.importFile(locate("smalldata/iris/multiple_iris_files_wheader"), col.names=c("C1","C2","C3","C4","C5"),
Expand Down
10 changes: 10 additions & 0 deletions h2o-r/tests/testdir_parser/runit_GH_15947_skipped_column_error.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source("../../scripts/h2o-r-test-setup.R")

test.skipped_columns <- function() {
iris_hf <- as.h2o(iris, skipped_columns=c(1,2))
expect_true(ncol(iris_hf) == (ncol(iris)-2))
print("Columns are skipped!!!")
}

doTest("Test skipped_columns when using as.h2o to change data frame to H2O Frame.", test.skipped_columns)

0 comments on commit da7946f

Please sign in to comment.