Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix for issue skipped_columns not working #15947 #16071

Closed
wants to merge 11 commits into from
47 changes: 34 additions & 13 deletions h2o-py/h2o/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,21 +146,42 @@ def _upload_python_object(self, python_obj, destination_frame=None, header=0, se
if not column_names:
column_names = col_header

if skipped_columns:
if isinstance(skipped_columns[0], str):
skipped_colsidx = []
for col_name in skipped_columns:
if col_name in column_names:
skipped_colsidx.append(column_names.index(col_name))
else:
raise H2OValueError("`skipped_columns` must be valid column names")

skipped_columns = skipped_colsidx

if column_names == col_header:
column_names = column_names[:len(column_names)-len(skipped_columns)]
else:
column_names = [column_names[index] for index in range(len(column_names)) if index not in skipped_columns]

if not column_names:
raise H2OValueError("`skipped_columns` cannot contain all columns")

# create a temporary file that will be written to
tmp_handle, tmp_path = tempfile.mkstemp(suffix=".csv")
tmp_file = os.fdopen(tmp_handle, 'w', **H2OFrame.__fdopen_kwargs)
# create a new csv writer object thingy
csv_writer = csv.writer(tmp_file, dialect="excel", quoting=csv.QUOTE_NONNUMERIC)
csv_writer.writerow(column_names)
if data_to_write and isinstance(data_to_write[0], dict):
for row in data_to_write:
csv_writer.writerow([row.get(k, None) for k in col_header])
else:
csv_writer.writerows(data_to_write)
tmp_file.close() # close the streams
self._upload_parse(tmp_path, destination_frame, 1, separator, column_names, column_types, na_strings,
skipped_columns, force_col_types)
os.remove(tmp_path) # delete the tmp file
try:
tmp_file = os.fdopen(tmp_handle, 'w', **H2OFrame.__fdopen_kwargs)
# create a new csv writer object thingy
csv_writer = csv.writer(tmp_file, dialect="excel", quoting=csv.QUOTE_NONNUMERIC)
csv_writer.writerow(column_names)
if data_to_write and isinstance(data_to_write[0], dict):
for row in data_to_write:
csv_writer.writerow([row.get(k, None) for k in col_header])
else:
csv_writer.writerows(data_to_write)
tmp_file.close() # close the streams
self._upload_parse(tmp_path, destination_frame, 1, separator, column_names, column_types, na_strings,
skipped_columns, force_col_types)
finally:
os.remove(tmp_path) # delete the tmp file

def _upload_sparse_matrix(self, matrix, destination_frame=None):
import scipy.sparse as sp
Expand Down
69 changes: 64 additions & 5 deletions h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,12 +125,68 @@ def H2OFrame_from_H2OFrame():
assert dupl4.columns == ["n1", "s1"]


def H2OFrame_skipped_columns_is_BUGGY():
def H2OFrame_skipped_columns():
fr = h2o.H2OFrame([
[1, "a", 1],
[2, "b", 0],
[3, "", 1],
]
)
skipped_columns_frame = h2o.H2OFrame(data, skipped_columns=[1])
assert skipped_columns_frame == fr # parse_setup is absolutely weird, with only half parameters passed to build the ParseSetup, and then a bunch of logic done locally, that's why it's buggy: see issue https://github.com/h2oai/h2o-3/issues/15947

def H2OFrame_column_names_with_skipped_columns():
skipped_columns_frame = h2o.H2OFrame(
[
[1, 2, 3],
[4, 5, 6],
],
column_names=['a', 'b', 'c'],
skipped_columns=[0, 1]
)

fr = h2o.H2OFrame(
[
[3],
[6],
],
column_names=['c'],
)

assert skipped_columns_frame == fr # parse_setup is absolutely weird, with only half parameters passed to build the ParseSetup, and then a bunch of logic done locally, that's why it's buggy: see issue https://github.com/h2oai/h2o-3/issues/15947

def H2OFrame_column_names_with_str_skipped_columns():
skipped_columns_frame = h2o.H2OFrame(
[
[1, 2, 3],
[4, 5, 6],
],
column_names=['a', 'b', 'c'],
skipped_columns=['a', 'b']
)

fr = h2o.H2OFrame(
[
[3],
[6],
],
column_names=['c'],
)

assert skipped_columns_frame == fr # parse_setup is absolutely weird, with only half parameters passed to build the ParseSetup, and then a bunch of logic done locally, that's why it's buggy: see issue https://github.com/h2oai/h2o-3/issues/15947

def H2OFrame_str_skipped_columns_without_column_names():
try:
h2o.H2OFrame(data, skipped_columns=[1])
assert False, "skipped_columns handling may be fixed now" # parse_setup is absolutely weird, with only half parameters passed to build the ParseSetup, and then a bunch of logic done locally, that's why it's buggy: see issue https://github.com/h2oai/h2o-3/issues/15947
invalid_frame = h2o.H2OFrame(
[
[1, 2, 3],
[4, 5, 6],
],
skipped_columns=['a', 'b']
)

except ValueError as e:
assert "length of col_names should be equal to the number of columns parsed: 4 vs 3" in str(e)
assert "`skipped_columns` must be valid column names" in str(e)


pu.run_tests([
Expand All @@ -141,5 +197,8 @@ def H2OFrame_skipped_columns_is_BUGGY():
H2OFrame_from_pandas,
H2OFrame_from_scipy,
H2OFrame_from_H2OFrame,
H2OFrame_skipped_columns_is_BUGGY
H2OFrame_skipped_columns,
H2OFrame_column_names_with_skipped_columns,
H2OFrame_column_names_with_str_skipped_columns,
H2OFrame_str_skipped_columns_without_column_names
])
2 changes: 1 addition & 1 deletion h2o-r/h2o-package/R/frame.R
Original file line number Diff line number Diff line change
Expand Up @@ -2902,7 +2902,7 @@ h2o.cor <- function(x, y=NULL,na.rm = FALSE, use, method="Pearson"){
if (is.null(method) || is.na(method)) {
stop("Correlation method must be specified.")
}

# Eager, mostly to match prior semantics but no real reason it need to be
expr <- .newExpr("cor",x,y,.quote(use), .quote(method))
if( (nrow(x)==1L || (ncol(x)==1L && ncol(y)==1L)) ) .eval.scalar(expr)
Expand Down