Skip to content

Commit

Permalink
Merge pull request #166 from alliander-opensource/feature/lazy-excel-…
Browse files Browse the repository at this point in the history
…loading

Lazy excel loading
  • Loading branch information
bramstoeller committed Mar 2, 2023
2 parents 7e498b3 + 5589888 commit 43c2949
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 37 deletions.
36 changes: 27 additions & 9 deletions src/power_grid_model_io/data_stores/excel_file_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from power_grid_model_io.data_stores.base_data_store import BaseDataStore
from power_grid_model_io.data_types import TabularData
from power_grid_model_io.data_types.tabular_data import LazyDataFrame


class ExcelFileStore(BaseDataStore[TabularData]):
Expand All @@ -24,7 +25,7 @@ class ExcelFileStore(BaseDataStore[TabularData]):
same values) or renamed.
"""

__slots__ = ("_file_paths", "_header_rows")
__slots__ = ("_file_paths", "_excel_files", "_header_rows")

_unnamed_pattern: re.Pattern = re.compile(r"Unnamed: \d+_level_\d+")

Expand All @@ -34,6 +35,7 @@ def __init__(self, file_path: Optional[Path] = None, **extra_paths: Path):
# Create a dictionary of all supplied file paths:
# {"": file_path, extra_name[0]: extra_path[0], extra_name[1]: extra_path[1], ...}
self._file_paths: Dict[str, Path] = {}
self._excel_files: Dict[str, pd.ExcelFile] = {}
if file_path is not None:
self._file_paths[""] = file_path
for name, path in extra_paths.items():
Expand Down Expand Up @@ -62,21 +64,37 @@ def load(self) -> TabularData:
have no prefix, while the tables of all the extra files will be prefixed with the name of the keyword argument
as supplied in the constructor.
"""
data: Dict[str, pd.DataFrame] = {}
data: Dict[str, LazyDataFrame] = {}
for name, path in self._file_paths.items():
with path.open(mode="rb") as file_pointer:
spreadsheet = pd.read_excel(io=file_pointer, sheet_name=None, header=self._header_rows)
for sheet_name, sheet_data in spreadsheet.items():
sheet_data = self._remove_unnamed_column_placeholders(data=sheet_data)
sheet_data = self._handle_duplicate_columns(data=sheet_data, sheet_name=sheet_name)
self._excel_files[name] = pd.ExcelFile(path)
for sheet_name in self._excel_files[name].sheet_names:
loader = self._load_sheet_wrapper(name, sheet_name)
if name:
sheet_name = f"{name}.{sheet_name}"
if sheet_name in data:
raise ValueError(f"Duplicate sheet name '{sheet_name}'")
data[sheet_name] = sheet_data

data[sheet_name] = loader
return TabularData(**data)

def _load_sheet_wrapper(self, name: str, sheet_name: str):
    """
    Build a deferred loader for a single Excel sheet.

    Nothing is parsed when this method is called; the sheet is only read from the stored Excel file at the
    moment the returned callable is invoked.

    Args:
        name: the key of the file in self._excel_files (empty string for the main file)
        sheet_name: the name of the sheet within that file

    Returns: A zero-argument callable that parses the sheet (using self._header_rows as header), removes the
        unnamed column placeholders, handles duplicate columns and returns the result.
    """

    def wrapper():
        # The file is looked up at call time, not when the loader is created
        excel_file = self._excel_files[name]
        parsed = excel_file.parse(sheet_name, header=self._header_rows)
        without_placeholders = self._remove_unnamed_column_placeholders(data=parsed)
        return self._handle_duplicate_columns(data=without_placeholders, sheet_name=sheet_name)

    return wrapper

def save(self, data: TabularData) -> None:
"""
Store tabular data as one or more Excel files.
Expand Down
41 changes: 19 additions & 22 deletions tests/unit/data_stores/test_excel_file_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from power_grid_model_io.data_stores.excel_file_store import ExcelFileStore
from power_grid_model_io.data_types.tabular_data import TabularData

from ...utils import assert_log_exists
from ...utils import MockExcelFile, assert_log_exists

PandasExcelData = Dict[str, pd.DataFrame]

Expand Down Expand Up @@ -103,55 +103,57 @@ def test_files__read_only():

@patch("power_grid_model_io.data_stores.excel_file_store.ExcelFileStore._handle_duplicate_columns")
@patch("power_grid_model_io.data_stores.excel_file_store.ExcelFileStore._remove_unnamed_column_placeholders")
@patch("power_grid_model_io.data_stores.excel_file_store.Path.open", mock_open())
@patch("power_grid_model_io.data_stores.excel_file_store.pd.read_excel")
@patch("power_grid_model_io.data_stores.excel_file_store.pd.ExcelFile")
def test_load(
mock_read_excel: MagicMock,
mock_excel_file: MagicMock,
mock_remove_unnamed_column_placeholders: MagicMock,
mock_handle_duplicate_columns: MagicMock,
objects_excel: PandasExcelData,
):
# Arrange
fs = ExcelFileStore(file_path=Path("input_data.xlsx"))
mock_read_excel.return_value = objects_excel
mock_excel_file.return_value = MockExcelFile(objects_excel)
mock_remove_unnamed_column_placeholders.side_effect = noop
mock_handle_duplicate_columns.side_effect = noop

# Act
data = fs.load()

# Assert
mock_read_excel.assert_called_once()
mock_excel_file.assert_called_once()
pd.testing.assert_frame_equal(data["Nodes"], objects_excel["Nodes"])
pd.testing.assert_frame_equal(data["Lines"], objects_excel["Lines"])
assert mock_remove_unnamed_column_placeholders.call_args_list[0] == call(data=objects_excel["Nodes"])
assert mock_remove_unnamed_column_placeholders.call_args_list[1] == call(data=objects_excel["Lines"])
assert mock_handle_duplicate_columns.call_args_list[0] == call(data=objects_excel["Nodes"], sheet_name="Nodes")
assert mock_handle_duplicate_columns.call_args_list[1] == call(data=objects_excel["Lines"], sheet_name="Lines")
pd.testing.assert_frame_equal(data["Nodes"], objects_excel["Nodes"])
pd.testing.assert_frame_equal(data["Lines"], objects_excel["Lines"])


@patch("power_grid_model_io.data_stores.excel_file_store.ExcelFileStore._handle_duplicate_columns")
@patch("power_grid_model_io.data_stores.excel_file_store.ExcelFileStore._remove_unnamed_column_placeholders")
@patch("power_grid_model_io.data_stores.excel_file_store.Path.open", mock_open())
@patch("power_grid_model_io.data_stores.excel_file_store.pd.read_excel")
@patch("power_grid_model_io.data_stores.excel_file_store.pd.ExcelFile")
def test_load__extra(
mock_read_excel: MagicMock,
mock_excel_file: MagicMock,
mock_remove_unnamed_column_placeholders: MagicMock,
mock_handle_duplicate_columns: MagicMock,
objects_excel: PandasExcelData,
specs_excel: PandasExcelData,
):

# Arrange
fs = ExcelFileStore(Path("input_data.xlsx"), foo=Path("foo_types.xlsx"))
mock_read_excel.side_effect = (objects_excel, specs_excel)
mock_excel_file.side_effect = (MockExcelFile(objects_excel), MockExcelFile(specs_excel))
mock_remove_unnamed_column_placeholders.side_effect = noop
mock_handle_duplicate_columns.side_effect = noop

# Act
data = fs.load()

# Assert
assert mock_read_excel.call_count == 2
assert mock_excel_file.call_count == 2
pd.testing.assert_frame_equal(data["Nodes"], objects_excel["Nodes"])
pd.testing.assert_frame_equal(data["Lines"], objects_excel["Lines"])
pd.testing.assert_frame_equal(data["foo.Colors"], specs_excel["Colors"])
pd.testing.assert_frame_equal(data["foo.Lines"], specs_excel["Lines"])
assert mock_remove_unnamed_column_placeholders.call_args_list[0] == call(data=objects_excel["Nodes"])
assert mock_remove_unnamed_column_placeholders.call_args_list[1] == call(data=objects_excel["Lines"])
assert mock_remove_unnamed_column_placeholders.call_args_list[2] == call(data=specs_excel["Colors"])
Expand All @@ -160,26 +162,21 @@ def test_load__extra(
assert mock_handle_duplicate_columns.call_args_list[1] == call(data=objects_excel["Lines"], sheet_name="Lines")
assert mock_handle_duplicate_columns.call_args_list[2] == call(data=specs_excel["Colors"], sheet_name="Colors")
assert mock_handle_duplicate_columns.call_args_list[3] == call(data=specs_excel["Lines"], sheet_name="Lines")
pd.testing.assert_frame_equal(data["Nodes"], objects_excel["Nodes"])
pd.testing.assert_frame_equal(data["Lines"], objects_excel["Lines"])
pd.testing.assert_frame_equal(data["foo.Colors"], specs_excel["Colors"])
pd.testing.assert_frame_equal(data["foo.Lines"], specs_excel["Lines"])


@patch("power_grid_model_io.data_stores.excel_file_store.ExcelFileStore._handle_duplicate_columns")
@patch("power_grid_model_io.data_stores.excel_file_store.ExcelFileStore._remove_unnamed_column_placeholders")
@patch("power_grid_model_io.data_stores.excel_file_store.Path.open", mock_open())
@patch("power_grid_model_io.data_stores.excel_file_store.pd.read_excel")
@patch("power_grid_model_io.data_stores.excel_file_store.pd.ExcelFile")
def test_load__extra__duplicate_sheet_name(
mock_read_excel: MagicMock,
mock_excel_file: MagicMock,
mock_remove_unnamed_column_placeholders: MagicMock,
mock_handle_duplicate_columns: MagicMock,
):
# Arrange
foo_data = {"bar.Nodes": pd.DataFrame()}
bar_data = {"Nodes": pd.DataFrame()}
fs = ExcelFileStore(Path("foo.xlsx"), bar=Path("bar.xlsx"))
mock_read_excel.side_effect = (foo_data, bar_data)
mock_excel_file.side_effect = (MockExcelFile(foo_data), MockExcelFile(bar_data))
mock_remove_unnamed_column_placeholders.side_effect = noop
mock_handle_duplicate_columns.side_effect = noop

Expand Down
12 changes: 6 additions & 6 deletions tests/unit/data_stores/test_vision_excel_file_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,16 @@
from power_grid_model_io.data_stores.vision_excel_file_store import VisionExcelFileStore


@patch("power_grid_model_io.data_stores.excel_file_store.pd.read_excel")
@patch("power_grid_model_io.data_stores.excel_file_store.pd.ExcelFile")
@patch("power_grid_model_io.data_stores.excel_file_store.Path.open", mock_open())
def test_header_rows(read_excel_mock: MagicMock):
def test_header_rows(mock_excel_file: MagicMock):
# Arrange
store = VisionExcelFileStore(file_path=Path("dummy.xlsx"))
read_excel_mock.return_value = {}
mock_excel_file.return_value.sheet_names = ["foo"]

# Act
store.load()
data = store.load()
data["foo"]

# Assert
read_excel_mock.assert_called_once()
assert read_excel_mock.call_args_list[0].kwargs["header"] == [0, 1]
mock_excel_file.return_value.parse.assert_called_once_with("foo", header=[0, 1])
12 changes: 12 additions & 0 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,3 +256,15 @@ def __len__(self):

def __getitem__(self, item: str):
return MockVal(pd.Series(name=item, dtype=np.float64))


class MockExcelFile:
    """
    In-memory stand-in for the parts of pd.ExcelFile that the tests rely on.

    The sheets are supplied as a dictionary mapping each sheet name to its DataFrame; only the `sheet_names`
    property and the `parse()` method are provided.
    """

    def __init__(self, data: Dict[str, pd.DataFrame]):
        # Keys are the sheet names, values are the sheet contents
        self.data = data

    @property
    def sheet_names(self) -> List[str]:
        """The names of all sheets, in the iteration order of the underlying dictionary."""
        return [*self.data]

    def parse(self, sheet_name: str, **_kwargs) -> pd.DataFrame:
        """Return the stored DataFrame for `sheet_name`; any keyword arguments (e.g. header) are ignored."""
        sheet = self.data[sheet_name]
        return sheet

0 comments on commit 43c2949

Please sign in to comment.