From 574cf6fd214e4efd623891a2eaf2d123288db002 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Fri, 14 Oct 2022 19:56:36 +0200 Subject: [PATCH 1/5] CSV data store (beta) Signed-off-by: Bram Stoeller --- .../data_stores/csv_dir_store.py | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 src/power_grid_model_io/data_stores/csv_dir_store.py diff --git a/src/power_grid_model_io/data_stores/csv_dir_store.py b/src/power_grid_model_io/data_stores/csv_dir_store.py new file mode 100644 index 00000000..fa510b94 --- /dev/null +++ b/src/power_grid_model_io/data_stores/csv_dir_store.py @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project +# +# SPDX-License-Identifier: MPL-2.0 +""" +CSV Directory Store +""" + +from pathlib import Path +from typing import Any, Dict, List + +import pandas as pd + +from power_grid_model_io.data_stores.base_data_store import BaseDataStore +from power_grid_model_io.data_types import TabularData + + +class CsvDirStore(BaseDataStore[TabularData]): + """ + CSV Directory Store + + The first row of each .csv file is expected to contain the column names, unless specified differently by an + extension of this class. + """ + + __slots__ = ("_dir_path", "_csv_kwargs", "_header_rows") + + def __init__(self, dir_path: Path, **csv_kwargs): + super().__init__() + self._dir_path = dir_path + self._csv_kwargs: Dict[str, Any] = csv_kwargs + self._header_rows: List[int] = [0] + + def load(self) -> TabularData: + """ + Load all CSV files in a directory as tabular data. + """ + data: Dict[str, pd.DataFrame] = {} + for path in self._dir_path.glob("*.csv"): + data[path.stem] = pd.read_csv(filepath_or_buffer=path, header=self._header_rows, **self._csv_kwargs) + + return TabularData(**data) + + def save(self, data: TabularData) -> None: + """ + Store each table in data as a separate CSV file + """ + for table_name, table_data in data.items(): + table_data.to_csv(path_or_buf=self._dir_path / f"{table_name}.csv", **self._csv_kwargs) From d2c15303ffaa244cc34326fdb5f8336e8d43ca94 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Fri, 14 Oct 2022 20:27:51 +0200 Subject: [PATCH 2/5] Lazy loading CSV files (or tables in general) Signed-off-by: Bram Stoeller --- .../data_stores/csv_dir_store.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/power_grid_model_io/data_stores/csv_dir_store.py b/src/power_grid_model_io/data_stores/csv_dir_store.py index fa510b94..e442b2c5 100644 --- a/src/power_grid_model_io/data_stores/csv_dir_store.py +++ b/src/power_grid_model_io/data_stores/csv_dir_store.py @@ -6,7 +6,7 @@ """ from pathlib import Path -from typing import Any, Dict, List +from typing import Any, Callable, Dict, List import pandas as pd @@ -26,17 +26,24 @@ class CsvDirStore(BaseDataStore[TabularData]): def __init__(self, dir_path: Path, **csv_kwargs): super().__init__() - self._dir_path = dir_path + self._dir_path = Path(dir_path) self._csv_kwargs: Dict[str, Any] = csv_kwargs self._header_rows: List[int] = [0] def load(self) -> TabularData: """ - Load all CSV files in a directory as tabular data. + Create a lazy loader for all CSV files in a directory and store them in a TabularData instance. """ - data: Dict[str, pd.DataFrame] = {} + + def lazy_csv_loader(csv_path: Path) -> Callable[[], pd.DataFrame]: + def csv_loader(): + return pd.read_csv(filepath_or_buffer=csv_path, header=self._header_rows, **self._csv_kwargs) + + return csv_loader + + data: Dict[str, Callable[[], pd.DataFrame]] = {} for path in self._dir_path.glob("*.csv"): - data[path.stem] = pd.read_csv(filepath_or_buffer=path, header=self._header_rows, **self._csv_kwargs) + data[path.stem] = lazy_csv_loader(path) return TabularData(**data) From 947c77fc7204baacc38460d3c1ab1aebe56e8616 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Thu, 2 Mar 2023 14:23:31 +0100 Subject: [PATCH 3/5] Unit tests for csv dir store Signed-off-by: Bram Stoeller --- .../data_stores/csv_dir_store.py | 4 +- .../data_stores/excel_file_store.py | 3 +- .../data_types/__init__.py | 2 +- tests/unit/data_stores/test_csv_dir_store.py | 60 +++++++++++++++++++ 4 files changed, 64 insertions(+), 5 deletions(-) create mode 100644 tests/unit/data_stores/test_csv_dir_store.py diff --git a/src/power_grid_model_io/data_stores/csv_dir_store.py b/src/power_grid_model_io/data_stores/csv_dir_store.py index e442b2c5..62f2d195 100644 --- a/src/power_grid_model_io/data_stores/csv_dir_store.py +++ b/src/power_grid_model_io/data_stores/csv_dir_store.py @@ -11,7 +11,7 @@ import pandas as pd from power_grid_model_io.data_stores.base_data_store import BaseDataStore -from power_grid_model_io.data_types import TabularData +from power_grid_model_io.data_types import LazyDataFrame, TabularData class CsvDirStore(BaseDataStore[TabularData]): @@ -41,7 +41,7 @@ def csv_loader(): return csv_loader - data: Dict[str, Callable[[], pd.DataFrame]] = {} + data: Dict[str, LazyDataFrame] = {} for path in self._dir_path.glob("*.csv"): data[path.stem] = lazy_csv_loader(path) diff --git a/src/power_grid_model_io/data_stores/excel_file_store.py b/src/power_grid_model_io/data_stores/excel_file_store.py index 58adfe2a..4beaafd2 100644 --- a/src/power_grid_model_io/data_stores/excel_file_store.py +++ b/src/power_grid_model_io/data_stores/excel_file_store.py @@ -12,8 +12,7 @@ import pandas as pd from power_grid_model_io.data_stores.base_data_store import BaseDataStore -from power_grid_model_io.data_types import TabularData -from power_grid_model_io.data_types.tabular_data import LazyDataFrame +from power_grid_model_io.data_types import LazyDataFrame, TabularData class ExcelFileStore(BaseDataStore[TabularData]): diff --git a/src/power_grid_model_io/data_types/__init__.py b/src/power_grid_model_io/data_types/__init__.py index a0a45c0a..2cd7fb7a 100644 --- a/src/power_grid_model_io/data_types/__init__.py +++ b/src/power_grid_model_io/data_types/__init__.py @@ -6,4 +6,4 @@ """ from power_grid_model_io.data_types._data_types import ExtraInfo, ExtraInfoLookup, StructuredData -from power_grid_model_io.data_types.tabular_data import TabularData +from power_grid_model_io.data_types.tabular_data import LazyDataFrame, TabularData diff --git a/tests/unit/data_stores/test_csv_dir_store.py b/tests/unit/data_stores/test_csv_dir_store.py new file mode 100644 index 00000000..a6ca754c --- /dev/null +++ b/tests/unit/data_stores/test_csv_dir_store.py @@ -0,0 +1,60 @@ +# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model project +# +# SPDX-License-Identifier: MPL-2.0 + +import tempfile +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pandas as pd +import pytest + +from power_grid_model_io.data_stores.csv_dir_store import CsvDirStore +from power_grid_model_io.data_types import TabularData + + +@pytest.fixture() +def temp_dir(): + with tempfile.TemporaryDirectory() as tmp: + yield Path(tmp).resolve() + + +def touch(file_path: Path): + open(file_path, "wb").close() + + +@patch("power_grid_model_io.data_stores.csv_dir_store.pd.read_csv") +def test_load(mock_read_csv: MagicMock, temp_dir: Path): + # Arrange + foo_data = MagicMock() + bar_data = MagicMock() + touch(temp_dir / "foo.csv") + touch(temp_dir / "bar.csv") + mock_read_csv.side_effect = (foo_data, bar_data) + csv_dir = CsvDirStore(temp_dir, bla=True) + + # Act + csv_data = csv_dir.load() + + # Assert + mock_read_csv.assert_not_called() # The csv data is not yet loaded + assert csv_data["foo"] == foo_data + assert csv_data["bar"] == bar_data + mock_read_csv.assert_any_call(filepath_or_buffer=temp_dir / "foo.csv", header=[0], bla=True) + mock_read_csv.assert_any_call(filepath_or_buffer=temp_dir / "bar.csv", header=[0], bla=True) + + +@patch("power_grid_model_io.data_stores.csv_dir_store.pd.DataFrame.to_csv") +def test_save(mock_to_csv: MagicMock, temp_dir): + # Arrange + foo_data = pd.DataFrame() + bar_data = pd.DataFrame() + data = TabularData(foo=foo_data, bar=bar_data) + csv_dir = CsvDirStore(temp_dir, bla=True) + + # Act + csv_dir.save(data) + + # Assert + mock_to_csv.assert_any_call(path_or_buf=temp_dir / "foo.csv", bla=True) + mock_to_csv.assert_any_call(path_or_buf=temp_dir / "bar.csv", bla=True) From dd152500d9ea071cc273808b31b043476452ce49 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Thu, 2 Mar 2023 14:32:11 +0100 Subject: [PATCH 4/5] Use LazyDataFrame Signed-off-by: Bram Stoeller --- src/power_grid_model_io/data_stores/csv_dir_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/power_grid_model_io/data_stores/csv_dir_store.py b/src/power_grid_model_io/data_stores/csv_dir_store.py index 62f2d195..ba390285 100644 --- a/src/power_grid_model_io/data_stores/csv_dir_store.py +++ b/src/power_grid_model_io/data_stores/csv_dir_store.py @@ -35,7 +35,7 @@ def load(self) -> TabularData: Create a lazy loader for all CSV files in a directory and store them in a TabularData instance. """ - def lazy_csv_loader(csv_path: Path) -> Callable[[], pd.DataFrame]: + def lazy_csv_loader(csv_path: Path) -> LazyDataFrame: def csv_loader(): return pd.read_csv(filepath_or_buffer=csv_path, header=self._header_rows, **self._csv_kwargs) From 6a3b43558c67858539000884e8a085889340c0d9 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Thu, 2 Mar 2023 14:40:32 +0100 Subject: [PATCH 5/5] Remove unused import Signed-off-by: Bram Stoeller --- src/power_grid_model_io/data_stores/csv_dir_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/power_grid_model_io/data_stores/csv_dir_store.py b/src/power_grid_model_io/data_stores/csv_dir_store.py index ba390285..18f4f27c 100644 --- a/src/power_grid_model_io/data_stores/csv_dir_store.py +++ b/src/power_grid_model_io/data_stores/csv_dir_store.py @@ -6,7 +6,7 @@ """ from pathlib import Path -from typing import Any, Callable, Dict, List +from typing import Any, Dict, List import pandas as pd