
Commit

fixes
Signed-off-by: Anatoly Myachev <[email protected]>
anmyachev committed Mar 2, 2024
1 parent ac8c9cb commit 3b8fa14
Showing 5 changed files with 165 additions and 190 deletions.
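Every change in this commit follows the same pattern: the tests stop creating throwaway files with pandas' `ensure_clean` context manager (which deletes the file as soon as the `with` block exits) and instead write into pytest's per-test `tmp_path` directory, using Modin's `get_unique_filename` test helper to pick a file name. Below is a minimal before/after sketch of that pattern; it uses only the helpers that appear in this diff, and the test names and CSV contents are made up for illustration.

# Before: the file exists only inside the context manager and is deleted on exit.
from pandas._testing import ensure_clean

def test_old_style():
    with ensure_clean(".csv") as file:
        with open(file, "w") as f:
            f.write("a,b\n1,2")
        ...  # every read and comparison must happen inside the `with` block

# After: pytest's tmp_path fixture supplies a per-test directory that pytest
# itself cleans up, so no context manager (and no extra nesting) is needed.
from modin.pandas.test.utils import get_unique_filename

def test_new_style(tmp_path):
    unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
    with open(unique_filename, "w") as f:
        f.write("a,b\n1,2")
    ...  # the file stays available for the rest of the test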
@@ -18,14 +18,14 @@
 import pandas
 import pyarrow
 import pytest
-from pandas._testing import ensure_clean
 from pandas.core.dtypes.common import is_list_like
 from pyhdk import __version__ as hdk_version

 from modin.config import StorageFormat
 from modin.pandas.test.utils import (
     create_test_dfs,
     default_to_pandas_ignore_string,
+    get_unique_filename,
     io_ops_bad_exc,
     random_state,
     test_data,
@@ -324,17 +324,17 @@ def test_read_csv_datetime(

     @pytest.mark.parametrize("engine", [None, "arrow"])
     @pytest.mark.parametrize("parse_dates", [None, True, False])
-    def test_read_csv_datetime_tz(self, engine, parse_dates):
-        with ensure_clean(".csv") as file:
-            with open(file, "w") as f:
-                f.write("test\n2023-01-01T00:00:00.000-07:00")
+    def test_read_csv_datetime_tz(self, engine, parse_dates, tmp_path):
+        unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+        with open(unique_filename, "w") as f:
+            f.write("test\n2023-01-01T00:00:00.000-07:00")

-            eval_io(
-                fn_name="read_csv",
-                filepath_or_buffer=file,
-                md_extra_kwargs={"engine": engine},
-                parse_dates=parse_dates,
-            )
+        eval_io(
+            fn_name="read_csv",
+            filepath_or_buffer=unique_filename,
+            md_extra_kwargs={"engine": engine},
+            parse_dates=parse_dates,
+        )

     @pytest.mark.parametrize("engine", [None, "arrow"])
     @pytest.mark.parametrize(
@@ -382,26 +382,26 @@ def test_read_csv_col_handling(
             "c1.1,c1,c1.1,c1,c1.1,c1.2,c1.2,c2",
         ],
     )
-    def test_read_csv_duplicate_cols(self, cols):
+    def test_read_csv_duplicate_cols(self, cols, tmp_path):
         def test(df, lib, **kwargs):
             data = f"{cols}\n"
-            with ensure_clean(".csv") as fname:
-                with open(fname, "w") as f:
-                    f.write(data)
-                return lib.read_csv(fname)
+            unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+            with open(unique_filename, "w") as f:
+                f.write(data)
+            return lib.read_csv(unique_filename)

         run_and_compare(test, data={})

-    def test_read_csv_dtype_object(self):
+    def test_read_csv_dtype_object(self, tmp_path):
         with pytest.warns(UserWarning) as warns:
-            with ensure_clean(".csv") as file:
-                with open(file, "w") as f:
-                    f.write("test\ntest")
+            unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+            with open(unique_filename, "w") as f:
+                f.write("test\ntest")

-                def test(**kwargs):
-                    return pd.read_csv(file, dtype={"test": "object"})
+            def test(**kwargs):
+                return pd.read_csv(unique_filename, dtype={"test": "object"})

-                run_and_compare(test, data={})
+            run_and_compare(test, data={})
         for warn in warns.list:
             assert not re.match(r".*defaulting to pandas.*", str(warn))

@@ -870,30 +870,30 @@ def concat(df1, df2, lib, **kwargs):
     @pytest.mark.parametrize("transform", [True, False])
     @pytest.mark.parametrize("sort_last", [True, False])
     # RecursionError in case of concatenation of big number of frames
-    def test_issue_5889(self, transform, sort_last):
-        with ensure_clean(".csv") as file:
-            data = {"a": [1, 2, 3], "b": [1, 2, 3]} if transform else {"a": [1, 2, 3]}
-            pandas.DataFrame(data).to_csv(file, index=False)
+    def test_issue_5889(self, transform, sort_last, tmp_path):
+        unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+        data = {"a": [1, 2, 3], "b": [1, 2, 3]} if transform else {"a": [1, 2, 3]}
+        pandas.DataFrame(data).to_csv(unique_filename, index=False)

-            def test_concat(lib, **kwargs):
-                if transform:
+        def test_concat(lib, **kwargs):
+            if transform:

-                    def read_csv():
-                        return lib.read_csv(file)["b"]
+                def read_csv():
+                    return lib.read_csv(unique_filename)["b"]

-                else:
+            else:

-                    def read_csv():
-                        return lib.read_csv(file)
+                def read_csv():
+                    return lib.read_csv(unique_filename)

-                df = read_csv()
-                for _ in range(100):
-                    df = lib.concat([df, read_csv()])
-                if sort_last:
-                    df = lib.concat([df, read_csv()], sort=True)
-                return df
+            df = read_csv()
+            for _ in range(100):
+                df = lib.concat([df, read_csv()])
+            if sort_last:
+                df = lib.concat([df, read_csv()], sort=True)
+            return df

-            run_and_compare(test_concat, data={})
+        run_and_compare(test_concat, data={})


 class TestGroupby:
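For context, `tmp_path` is pytest's built-in fixture: each test that lists it as a parameter receives a fresh `pathlib.Path` pointing at a unique temporary directory that pytest creates before the test and prunes afterwards, which is why the tests above can simply accept `tmp_path` and hand it to `get_unique_filename` as `data_dir`. A small illustration (the test name is hypothetical):

import pathlib

def test_tmp_path_basics(tmp_path):
    # A unique, initially empty directory is created for this test invocation.
    assert isinstance(tmp_path, pathlib.Path)
    assert tmp_path.is_dir()
    # Files written here need no manual cleanup; pytest removes old tmp dirs.
    (tmp_path / "example.csv").write_text("a,b\n1,2\n")
    assert (tmp_path / "example.csv").exists()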
68 changes: 28 additions & 40 deletions modin/experimental/pandas/test/test_io_exp.py
@@ -18,13 +18,13 @@
 import numpy as np
 import pandas
 import pytest
-from pandas._testing import ensure_clean

 import modin.experimental.pandas as pd
-from modin.config import AsyncReadMode, Engine
+from modin.config import Engine
 from modin.pandas.test.utils import (
     df_equals,
     eval_general,
+    get_unique_filename,
     parse_dates_values_by_id,
     test_data,
     time_parsing_csv_path,
@@ -355,7 +355,7 @@ def test_xml_glob(tmp_path, filename):
     reason=f"{Engine.get()} does not have experimental read_custom_text API",
 )
 @pytest.mark.parametrize("set_async_read_mode", [False, True], indirect=True)
-def test_read_custom_json_text(set_async_read_mode):
+def test_read_custom_json_text(set_async_read_mode, tmp_path):
     def _generate_json(file_name, nrows, ncols):
         data = np.random.rand(nrows, ncols)
         df = pandas.DataFrame(data, columns=[f"col{x}" for x in range(ncols)])
@@ -374,33 +374,27 @@ def _custom_parser(io_input, **kwargs):
                 result[key].append(obj[key])
         return pandas.DataFrame(result).rename(columns={"col0": "testID"})

-    with ensure_clean() as filename:
-        _generate_json(filename, 64, 8)
+    unique_filename = get_unique_filename(data_dir=tmp_path)
+    _generate_json(unique_filename, 64, 8)

-        df1 = pd.read_custom_text(
-            filename,
-            columns=["testID", "col1", "col3"],
-            custom_parser=_custom_parser,
-            is_quoting=False,
-        )
-        df2 = pd.read_json(filename, lines=True)[["col0", "col1", "col3"]].rename(
-            columns={"col0": "testID"}
-        )
-        if AsyncReadMode.get():
-            # If read operations are asynchronous, then the dataframes
-            # check should be inside `ensure_clean` context
-            # because the file may be deleted before actual reading starts
-            df_equals(df1, df2)
-    if not AsyncReadMode.get():
-        df_equals(df1, df2)
+    df1 = pd.read_custom_text(
+        unique_filename,
+        columns=["testID", "col1", "col3"],
+        custom_parser=_custom_parser,
+        is_quoting=False,
+    )
+    df2 = pd.read_json(unique_filename, lines=True)[["col0", "col1", "col3"]].rename(
+        columns={"col0": "testID"}
+    )
+    df_equals(df1, df2)


 @pytest.mark.skipif(
     Engine.get() not in ("Ray", "Unidist", "Dask"),
     reason=f"{Engine.get()} does not have experimental API",
 )
 @pytest.mark.parametrize("set_async_read_mode", [False, True], indirect=True)
-def test_read_evaluated_dict(set_async_read_mode):
+def test_read_evaluated_dict(set_async_read_mode, tmp_path):
     def _generate_evaluated_dict(file_name, nrows, ncols):
         result = {}
         keys = [f"col{x}" for x in range(ncols)]
@@ -430,23 +424,17 @@ def columns_callback(io_input, **kwargs):
                 break
         return columns

-    with ensure_clean() as filename:
-        _generate_evaluated_dict(filename, 64, 8)
+    unique_filename = get_unique_filename(data_dir=tmp_path)
+    _generate_evaluated_dict(unique_filename, 64, 8)

-        df1 = pd.read_custom_text(
-            filename,
-            columns=["col1", "col2"],
-            custom_parser=_custom_parser,
-        )
-        assert df1.shape == (64, 2)
+    df1 = pd.read_custom_text(
+        unique_filename,
+        columns=["col1", "col2"],
+        custom_parser=_custom_parser,
+    )
+    assert df1.shape == (64, 2)

-        df2 = pd.read_custom_text(
-            filename, columns=columns_callback, custom_parser=_custom_parser
-        )
-        if AsyncReadMode.get():
-            # If read operations are asynchronous, then the dataframes
-            # check should be inside `ensure_clean` context
-            # because the file may be deleted before actual reading starts
-            df_equals(df1, df2)
-    if not AsyncReadMode.get():
-        df_equals(df1, df2)
+    df2 = pd.read_custom_text(
+        unique_filename, columns=columns_callback, custom_parser=_custom_parser
+    )
+    df_equals(df1, df2)
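The deleted `AsyncReadMode` branches existed because `ensure_clean` removes the file when its context exits: with asynchronous reads, a deferred `read_*` call could run after the file is already gone, so the comparison had to stay inside the `with` block. With `tmp_path` the file outlives the entire test regardless of when the read actually executes, so a single unconditional `df_equals` is enough. A rough sketch of the hazard the old code guarded against (simplified; this is not the repository's actual helper code):

import pandas
from pandas._testing import ensure_clean

with ensure_clean(".csv") as file:
    pandas.DataFrame({"a": [1, 2, 3]}).to_csv(file, index=False)
    inside = pandas.read_csv(file)  # fine: the file still exists here

# After the `with` block the file has been deleted, so a read that was only
# scheduled earlier (as with asynchronous read modes) and runs now would fail
# with FileNotFoundError. A file under tmp_path has no such lifetime problem.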
20 changes: 11 additions & 9 deletions modin/pandas/test/dataframe/test_indexing.py
@@ -17,7 +17,6 @@
 import numpy as np
 import pandas
 import pytest
-from pandas._testing import ensure_clean
 from pandas.testing import assert_index_equal

 import modin.pandas as pd
@@ -35,6 +34,7 @@
     df_equals,
     eval_general,
     generate_multiindex,
+    get_unique_filename,
     int_arg_keys,
     int_arg_values,
     name_contains,
@@ -2207,14 +2207,16 @@ def test___setitem__partitions_aligning():
     df_equals(md_df, pd_df)


-def test___setitem__with_mismatched_partitions():
-    with ensure_clean(".csv") as fname:
-        np.savetxt(fname, np.random.randint(0, 100, size=(200_000, 99)), delimiter=",")
-        modin_df = pd.read_csv(fname)
-        pandas_df = pandas.read_csv(fname)
-        modin_df["new"] = pd.Series(list(range(len(modin_df))))
-        pandas_df["new"] = pandas.Series(list(range(len(pandas_df))))
-        df_equals(modin_df, pandas_df)
+def test___setitem__with_mismatched_partitions(tmp_path):
+    unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+    np.savetxt(
+        unique_filename, np.random.randint(0, 100, size=(200_000, 99)), delimiter=","
+    )
+    modin_df = pd.read_csv(unique_filename)
+    pandas_df = pandas.read_csv(unique_filename)
+    modin_df["new"] = pd.Series(list(range(len(modin_df))))
+    pandas_df["new"] = pandas.Series(list(range(len(pandas_df))))
+    df_equals(modin_df, pandas_df)


 def test___setitem__mask():
