Skip to content

Commit

Permalink
Merge pull request #168 from alliander-opensource/feature/lazy-excel-…
Browse files Browse the repository at this point in the history
…loading

Simplify lazy excel loading
  • Loading branch information
bramstoeller committed Mar 2, 2023
2 parents 43c2949 + dbce5e0 commit d758f0e
Showing 1 changed file with 14 additions and 24 deletions.
38 changes: 14 additions & 24 deletions src/power_grid_model_io/data_stores/excel_file_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ def __init__(self, file_path: Optional[Path] = None, **extra_paths: Path):
# Create a dictionary of all supplied file paths:
# {"": file_path, extra_name[0]: extra_path[0], extra_name[1]: extra_path[1], ...}
self._file_paths: Dict[str, Path] = {}
self._excel_files: Dict[str, pd.ExcelFile] = {}
if file_path is not None:
self._file_paths[""] = file_path
for name, path in extra_paths.items():
Expand Down Expand Up @@ -64,37 +63,28 @@ def load(self) -> TabularData:
have no prefix, while the tables of all the extra files will be prefixed with the name of the keyword argument
as supplied in the constructor.
"""

def lazy_sheet_loader(xls_file: pd.ExcelFile, xls_sheet_name: str):
    """
    Build a zero-argument callable that parses one sheet on demand.

    The workbook is not parsed here; parsing only happens when the returned callable is invoked.

    Args:
        xls_file: the opened Excel workbook to read from
        xls_sheet_name: the name of the sheet to parse

    Returns: A callable that parses the sheet, removes the unnamed column placeholders, handles duplicate
        columns and returns the resulting sheet data.
    """

    def sheet_loader():
        # `self` is resolved from the enclosing method's scope when the loader runs.
        parsed = xls_file.parse(xls_sheet_name, header=self._header_rows)
        without_placeholders = self._remove_unnamed_column_placeholders(data=parsed)
        return self._handle_duplicate_columns(data=without_placeholders, sheet_name=xls_sheet_name)

    return sheet_loader

data: Dict[str, LazyDataFrame] = {}
for name, path in self._file_paths.items():
self._excel_files[name] = pd.ExcelFile(path)
for sheet_name in self._excel_files[name].sheet_names:
loader = self._load_sheet_wrapper(name, sheet_name)
if name:
excel_file = pd.ExcelFile(path)
for sheet_name in excel_file.sheet_names:
loader = lazy_sheet_loader(excel_file, sheet_name)
if name != "": # If the Excel file is not the main file, prefix the sheet name with the file name
sheet_name = f"{name}.{sheet_name}"
if sheet_name in data:
raise ValueError(f"Duplicate sheet name '{sheet_name}'")
data[sheet_name] = loader
return TabularData(**data)

def _load_sheet_wrapper(self, name: str, sheet_name: str):
"""
Load a single Excel sheet as a Pandas DataFrame.
Args:
name: the name of the file (empty string for the main sheet)
sheet_name: the name of the sheet
Returns: The contents the specified Excel sheet.
"""

def wrapper():
sheet_data = self._excel_files[name].parse(sheet_name, header=self._header_rows)
sheet_data = self._remove_unnamed_column_placeholders(data=sheet_data)
sheet_data = self._handle_duplicate_columns(data=sheet_data, sheet_name=sheet_name)
return sheet_data

return wrapper

def save(self, data: TabularData) -> None:
"""
Store tabular data as one or more Excel files.
Expand Down

0 comments on commit d758f0e

Please sign in to comment.