Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-15947: fixed skipped_column error in Python #16164

Merged
merged 3 commits into from
May 29, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
11 changes: 6 additions & 5 deletions h2o-py/h2o/h2o.py
Original file line number Diff line number Diff line change
Expand Up @@ -868,14 +868,15 @@ def parse_setup(raw_frames, destination_frame=None, header=0, separator=None, co
if ind in skipped_columns:
use_type[ind]=False

if column_names is not None:
if column_names is not None: # used when python object is converted to H2O Frame
if not isinstance(column_names, list): raise ValueError("col_names should be a list")
if (skipped_columns is not None) and len(skipped_columns)>0:
if (len(column_names)) != parse_column_len:
# if ((len(column_names)-len(skipped_columns)) != parse_column_len) and (len(column_names) != parse_column_len):
if (len(column_names)-len(skipped_columns)) != parse_column_len:
raise ValueError(
"length of col_names should be equal to the number of columns parsed: %d vs %d"
% (len(column_names), parse_column_len))
else:
"length of col_names minus length of skipped_columns should equal the number of columns parsed: "
"%d vs %d" % ((len(column_names)-len(skipped_columns), parse_column_len)))
else: # no skipped columns here
if len(column_names) != len(j["column_types"]): raise ValueError(
"length of col_names should be equal to the number of columns: %d vs %d"
% (len(column_names), len(j["column_types"])))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,12 +125,10 @@ def H2OFrame_from_H2OFrame():
assert dupl4.columns == ["n1", "s1"]


def H2OFrame_skipped_columns_is_BUGGY():
try:
h2o.H2OFrame(data, skipped_columns=[1])
assert False, "skipped_columns handling may be fixed now" # parse_setup is absolutely weird, with only half parameters passed to build the ParseSetup, and then a bunch of logic done locally, that's why it's buggy: see issue https://github.com/h2oai/h2o-3/issues/15947
except ValueError as e:
assert "length of col_names should be equal to the number of columns parsed: 4 vs 3" in str(e)
def H2OFrame_skipped_columns_BUG_fixed():
f1 = h2o.H2OFrame(data, skipped_columns=[1])
f2 = h2o.H2OFrame(data)
assert f1.ncol == (f2.ncol-1), "expected number of columns: {0}, actual column numbers: {1}".format(f1.ncol, (f2.ncol-1))


pu.run_tests([
Expand All @@ -141,5 +139,5 @@ def H2OFrame_skipped_columns_is_BUGGY():
H2OFrame_from_pandas,
H2OFrame_from_scipy,
H2OFrame_from_H2OFrame,
H2OFrame_skipped_columns_is_BUGGY
H2OFrame_skipped_columns_BUG_fixed
])
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import sys
sys.path.insert(1,"../../")
import h2o
from tests import pyunit_utils

# Seb has reported that skipped_columns does not work if skipped_columns is called with h2o.H2OFrame
def test_skipped_columns():
data = [[1, 4, "a", 1], [2, 5, "b", 0], [3, 6, "", 1]]
frame = h2o.H2OFrame(data, skipped_columns=[1, 2])
assert frame.ncol == 2, "Expected column number: 2. Actual: {0}".format(frame.ncol)

if __name__ == "__main__":
pyunit_utils.standalone_test(test_skipped_columns)
else:
test_skipped_columns()
22 changes: 14 additions & 8 deletions h2o-r/h2o-package/R/frame.R
Original file line number Diff line number Diff line change
Expand Up @@ -4109,6 +4109,7 @@ use.package <- function(package,
#'
#' @param x An \code{R} object.
#' @param destination_frame A string with the desired name for the H2OFrame
#' @param skipped_columns A list of integer containing columns to be skipped and not parsed into the final frame
#' @param use_datatable allow usage of data.table
#' @param \dots arguments passed to method arguments.
#' @export
Expand All @@ -4135,15 +4136,19 @@ use.package <- function(package,
#' stopifnot(is.h2o(m_hf), dim(m_hf) == dim(m))
#' }
#' }
as.h2o <- function(x, destination_frame="", ...) {
as.h2o <- function(x, destination_frame="", skipped_columns=NULL, ...) {
.key.validate(destination_frame)
UseMethod("as.h2o")
if (is.null(skipped_columns)) {
UseMethod("as.h2o")
} else {
as.h2o.data.frame(x, destination_frame=destination_frame, skipped_columns=skipped_columns)
}
}

#' @rdname as.h2o
#' @method as.h2o default
#' @export
as.h2o.default <- function(x, destination_frame="", ...) {
as.h2o.default <- function(x, destination_frame="", skipped_columns=NULL, ...) {
if( destination_frame=="" ) {
subx <- destination_frame.guess(deparse(substitute(x)))
destination_frame <- .key.make(if(nzchar(subx)) subx else paste0(class(x), "_", collapse = ""))
Expand All @@ -4152,13 +4157,13 @@ as.h2o.default <- function(x, destination_frame="", ...) {
data.frame(C1=x)
else
as.data.frame(x, ...)
as.h2o.data.frame(x, destination_frame=destination_frame)
as.h2o.data.frame(x, destination_frame=destination_frame, skipped_columns=skipped_columns)
}

#' @rdname as.h2o
#' @method as.h2o H2OFrame
#' @export
as.h2o.H2OFrame <- function(x, destination_frame="", ...) {
as.h2o.H2OFrame <- function(x, destination_frame="", skipped_columns=NULL, ...) {
if( destination_frame=="" ) {
subx <- destination_frame.guess(deparse(substitute(x)))
destination_frame <- .key.make(if(nzchar(subx)) subx else "H2OFrame_copy")
Expand All @@ -4173,7 +4178,7 @@ as.h2o.H2OFrame <- function(x, destination_frame="", ...) {
#' @seealso \code{\link{use.package}}
#' @references \url{https://h2o.ai/blog/2016/fast-csv-writing-for-r/}
#' @export
as.h2o.data.frame <- function(x, destination_frame="", use_datatable=TRUE, ...) {
as.h2o.data.frame <- function(x, destination_frame="", skipped_columns=NULL, use_datatable=TRUE, ...) {
if( destination_frame=="" ) {
subx <- destination_frame.guess(deparse(substitute(x)))
destination_frame <- .key.make(if(nzchar(subx)) subx else "data.frame")
Expand Down Expand Up @@ -4203,7 +4208,8 @@ as.h2o.data.frame <- function(x, destination_frame="", use_datatable=TRUE, ...)
if (verbose) cat(sprintf("writing csv to disk using '%s' took %.2fs\n", fun, proc.time()[[3]]-pt))
#if (verbose) pt <- proc.time()[[3]] # timings inside
h2f <- h2o.uploadFile(tmpf, destination_frame = destination_frame, header = TRUE, col.types=types,
col.names=colnames(x, do.NULL=FALSE, prefix="C"), na.strings=rep(c("NA_h2o"),ncol(x)))
col.names=colnames(x, do.NULL=FALSE, prefix="C"), na.strings=rep(c("NA_h2o"),ncol(x)),
skipped_columns=skipped_columns)
#if (verbose) cat(sprintf("uploading csv to h2o using 'h2o.uploadFile' took %.2fs\n", proc.time()[[3]]-pt))
file.remove(tmpf)
h2f
Expand All @@ -4215,7 +4221,7 @@ as.h2o.data.frame <- function(x, destination_frame="", use_datatable=TRUE, ...)
#' To speedup execution time for large sparse matrices, use h2o datatable. Make sure you have installed and imported data.table and slam packages.
#' Turn on h2o datatable by options("h2o.use.data.table"=TRUE)
#' @export
as.h2o.Matrix <- function(x, destination_frame="", use_datatable=TRUE, ...) {
as.h2o.Matrix <- function(x, destination_frame="", skipped_columns=NULL, use_datatable=TRUE, ...) {
if( destination_frame=="") {
subx <- destination_frame.guess(deparse(substitute(x)))
destination_frame <- .key.make(if(nzchar(subx)) subx else "Matrix")
Expand Down
6 changes: 4 additions & 2 deletions h2o-r/h2o-package/R/parse.R
Original file line number Diff line number Diff line change
Expand Up @@ -219,8 +219,10 @@ h2o.parseSetup <- function(data, pattern="", destination_frame = "", header = NA
else
col.names
if (!is.null(parseSetup$column_names) &&
(length(parseSetup$column_names) != parsedColLength)) {
stop("length of col.names must equal to the number of columns in dataset")
(length(parseSetup$column_names) != parsedColLength)) { # should equal, if not, need to check skipped_columns
if ((!is.null(skipped_columns) && ((length(parseSetup$column_names)-length(skipped_columns)) != parsedColLength))
|| is.null(skipped_columns)) # if no skipped column, this is an error. If skipped columns, check length
stop("length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset")
}
# change column names to what the user specified
if (!is.null(skipped_columns)) {
Expand Down
4 changes: 2 additions & 2 deletions h2o-r/tests/testdir_jira/runit_hexdev_29_import_types.R
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ test.continuous.or.categorical <- function() {

e <- tryCatch(h2o.importFile(locate("smalldata/iris/iris.csv"), col.names=c("C1","C2","C3","C4","C5","C6"),
col.types=list(by.col.name=c("C4"),types=c("Enum"))), error = function(x) x)
expect_true(e[[1]] == "length of col.names must equal to the number of columns in dataset")
expect_true(e[[1]] == "length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset")

# col.types as character vector
df.hex4 <- h2o.importFile(locate("smalldata/iris/multiple_iris_files"),
Expand Down Expand Up @@ -98,7 +98,7 @@ test.continuous.or.categorical <- function() {

e <- tryCatch(h2o.importFile(locate("smalldata/iris/iris.csv"), col.names=c("C1","C2","C3","C4","C5","C6"),
col.types=list(by.col.name=c("C4"),types=c("Enum"))), error = function(x) x)
expect_true(e[[1]] == "length of col.names must equal to the number of columns in dataset")
expect_true(e[[1]] == "length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset")

# col.types as character vector
df.hex6 <- h2o.importFile(locate("smalldata/iris/multiple_iris_files_wheader"), col.names=c("C1","C2","C3","C4","C5"),
Expand Down
10 changes: 10 additions & 0 deletions h2o-r/tests/testdir_parser/runit_GH_15947_skipped_column_error.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source("../../scripts/h2o-r-test-setup.R")

test.skipped_columns <- function() {
iris_hf <- as.h2o(iris, skipped_columns=c(1,2))
expect_true(ncol(iris_hf) == (ncol(iris)-2))
print("Columns are skipped!!!")
}

doTest("Test skipped_columns when using as.h2o to change data frame to H2O Frame.", test.skipped_columns)