h2oai · Devanshusisodiya · Feb 11, 2024 · Feb 12, 2024 · Feb 13, 2024 · Feb 13, 2024
diff --git a/h2o-py/h2o/frame.py b/h2o-py/h2o/frame.py
@@ -146,21 +146,42 @@ def _upload_python_object(self, python_obj, destination_frame=None, header=0, se
  if not column_names:
  column_names = col_header
 
+ if skipped_columns:
+ if isinstance(skipped_columns[0], str):
+ skipped_colsidx = []
+ for col_name in skipped_columns:
+ if col_name in column_names:
+ skipped_colsidx.append(column_names.index(col_name))
+ else:
+ raise H2OValueError("`skipped_columns` must be valid column names")
+
+ skipped_columns = skipped_colsidx
+
+ if column_names == col_header:
+ column_names = column_names[:len(column_names)-len(skipped_columns)]
+ else:
+ column_names = [column_names[index] for index in range(len(column_names)) if index not in skipped_columns]
+
+ if not column_names:
+ raise H2OValueError("`skipped_columns` cannot contain all columns")
+
  # create a temporary file that will be written to
  tmp_handle, tmp_path = tempfile.mkstemp(suffix=".csv")
- tmp_file = os.fdopen(tmp_handle, 'w', **H2OFrame.__fdopen_kwargs)
- # create a new csv writer object thingy
- csv_writer = csv.writer(tmp_file, dialect="excel", quoting=csv.QUOTE_NONNUMERIC)
- csv_writer.writerow(column_names)
- if data_to_write and isinstance(data_to_write[0], dict):
- for row in data_to_write:
- csv_writer.writerow([row.get(k, None) for k in col_header])
- else:
- csv_writer.writerows(data_to_write)
- tmp_file.close() # close the streams
- self._upload_parse(tmp_path, destination_frame, 1, separator, column_names, column_types, na_strings,
- skipped_columns, force_col_types)
- os.remove(tmp_path) # delete the tmp file
+ try:
+ tmp_file = os.fdopen(tmp_handle, 'w', **H2OFrame.__fdopen_kwargs)
+ # create a new csv writer object thingy
+ csv_writer = csv.writer(tmp_file, dialect="excel", quoting=csv.QUOTE_NONNUMERIC)
+ csv_writer.writerow(column_names)
+ if data_to_write and isinstance(data_to_write[0], dict):
+ for row in data_to_write:
+ csv_writer.writerow([row.get(k, None) for k in col_header])
+ else:
+ csv_writer.writerows(data_to_write)
+ tmp_file.close() # close the streams
+ self._upload_parse(tmp_path, destination_frame, 1, separator, column_names, column_types, na_strings,
+ skipped_columns, force_col_types)
+ finally:
+ os.remove(tmp_path) # delete the tmp file
 
  def _upload_sparse_matrix(self, matrix, destination_frame=None):
  import scipy.sparse as sp

diff --git a/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py b/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py
@@ -125,12 +125,68 @@ def H2OFrame_from_H2OFrame():
  assert dupl4.columns == ["n1", "s1"]
 
 
-def H2OFrame_skipped_columns_is_BUGGY():
+def H2OFrame_skipped_columns():
+ fr = h2o.H2OFrame([
+ [1, "a", 1],
+ [2, "b", 0],
+ [3, "", 1],
+ ]
+ )
+ skipped_columns_frame = h2o.H2OFrame(data, skipped_columns=[1])
+ assert skipped_columns_frame == fr # parse_setup is absolutely weird, with only half parameters passed to build the ParseSetup, and then a bunch of logic done locally, that's why it's buggy: see issue https://github.com/h2oai/h2o-3/issues/15947 
+
+def H2OFrame_column_names_with_skipped_columns():
+ skipped_columns_frame = h2o.H2OFrame(
+ [
+ [1, 2, 3],
+ [4, 5, 6],
+ ],
+ column_names=['a', 'b', 'c'],
+ skipped_columns=[0, 1]
+ )
+
+ fr = h2o.H2OFrame(
+ [
+ [3],
+ [6],
+ ],
+ column_names=['c'],
+ )
+
+ assert skipped_columns_frame == fr # parse_setup is absolutely weird, with only half parameters passed to build the ParseSetup, and then a bunch of logic done locally, that's why it's buggy: see issue https://github.com/h2oai/h2o-3/issues/15947 
+
+def H2OFrame_column_names_with_str_skipped_columns():
+ skipped_columns_frame = h2o.H2OFrame(
+ [
+ [1, 2, 3],
+ [4, 5, 6],
+ ],
+ column_names=['a', 'b', 'c'],
+ skipped_columns=['a', 'b']
+ )
+
+ fr = h2o.H2OFrame(
+ [
+ [3],
+ [6],
+ ],
+ column_names=['c'],
+ )
+
+ assert skipped_columns_frame == fr # parse_setup is absolutely weird, with only half parameters passed to build the ParseSetup, and then a bunch of logic done locally, that's why it's buggy: see issue https://github.com/h2oai/h2o-3/issues/15947 
+
+def H2OFrame_str_skipped_columns_without_column_names():
  try:
- h2o.H2OFrame(data, skipped_columns=[1])
- assert False, "skipped_columns handling may be fixed now" # parse_setup is absolutely weird, with only half parameters passed to build the ParseSetup, and then a bunch of logic done locally, that's why it's buggy: see issue https://github.com/h2oai/h2o-3/issues/15947 
+ invalid_frame = h2o.H2OFrame(
+ [
+ [1, 2, 3],
+ [4, 5, 6],
+ ],
+ skipped_columns=['a', 'b']
+ )
+
  except ValueError as e:
- assert "length of col_names should be equal to the number of columns parsed: 4 vs 3" in str(e)
+ assert "`skipped_columns` must be valid column names" in str(e)
 
 
 pu.run_tests([
@@ -141,5 +197,8 @@ def H2OFrame_skipped_columns_is_BUGGY():
  H2OFrame_from_pandas,
  H2OFrame_from_scipy,
  H2OFrame_from_H2OFrame,
- H2OFrame_skipped_columns_is_BUGGY
+ H2OFrame_skipped_columns,
+ H2OFrame_column_names_with_skipped_columns,
+ H2OFrame_column_names_with_str_skipped_columns,
+ H2OFrame_str_skipped_columns_without_column_names
 ])
diff --git a/h2o-r/h2o-package/R/frame.R b/h2o-r/h2o-package/R/frame.R
@@ -2902,7 +2902,7 @@ h2o.cor <- function(x, y=NULL,na.rm = FALSE, use, method="Pearson"){
  if (is.null(method) || is.na(method)) {
  stop("Correlation method must be specified.")
  }
- 
+
  # Eager, mostly to match prior semantics but no real reason it need to be
  expr <- .newExpr("cor",x,y,.quote(use), .quote(method))
  if( (nrow(x)==1L || (ncol(x)==1L && ncol(y)==1L)) ) .eval.scalar(expr)