Skip to content

Commit

Permalink
Avoid unnecessary write operations: Don't dump if JSON output didn't …
Browse files Browse the repository at this point in the history
…changed (#384)

* Avoid unnecessary write operations: Don't dump if nothing changed

The current behaviour is especially annoying in the machine-readable_anwendungshandbuch repository, where we get lots of noise in the Excel files.
see e.g. Hochfrequenz/machine-readable_anwendungshandbuecher#216
or Hochfrequenz/machine-readable_anwendungshandbuecher#217

* shut up mypy

* shutup 2

* Update src/kohlrahbi/ahb/__init__.py

* reduce redundancy in logging
  • Loading branch information
hf-kklein authored Jul 30, 2024
1 parent 88b596f commit 49119be
Show file tree
Hide file tree
Showing 2 changed files with 117 additions and 32 deletions.
26 changes: 22 additions & 4 deletions src/kohlrahbi/ahb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
)
from kohlrahbi.seed import Seed
from kohlrahbi.unfoldedahb import UnfoldedAhb
from kohlrahbi.unfoldedahb.unfoldedahbtable import are_equal_except_for_guids

_pruefi_pattern = re.compile(r"^[1-9]\d{4}$")

Expand Down Expand Up @@ -55,12 +56,29 @@ def process_ahb_table(
"""
unfolded_ahb = UnfoldedAhb.from_ahb_table(ahb_table=ahb_table, pruefi=pruefi)

try:
json_file_path = unfolded_ahb.get_flatahb_json_file_path(output_path)
excel_file_path = unfolded_ahb.get_xlsx_file_path(output_path)
csv_file_path = unfolded_ahb.get_csv_file_path(output_path)
except ValueError:
logger.warning("Error while determining file paths for pruefi '%s'. Skipping saving files.", pruefi)
return
pruefi_didnt_change_since_last_scraping: Optional[bool] = None
if AhbExportFileFormat.FLATAHB in file_type:
# the flat ahb ist the only file format from which we can READ to compare our current with previous results
if json_file_path.exists():
pruefi_didnt_change_since_last_scraping = are_equal_except_for_guids(unfolded_ahb, json_file_path)
# ⚠ here we assume that the csv/json/xlsx files are in sync, if they exist.
# this means: if the json file didn't change and a csv file exists, we expect the csv file to also be unchanged
if AhbExportFileFormat.XLSX in file_type:
unfolded_ahb.dump_xlsx(output_path)
if not excel_file_path.exists() or not pruefi_didnt_change_since_last_scraping:
unfolded_ahb.dump_xlsx(output_path)
if AhbExportFileFormat.FLATAHB in file_type:
unfolded_ahb.dump_flatahb_json(output_path)
if not json_file_path.exists() or not pruefi_didnt_change_since_last_scraping:
unfolded_ahb.dump_flatahb_json(output_path)
if AhbExportFileFormat.CSV in file_type:
unfolded_ahb.dump_csv(output_path)
if not csv_file_path.exists() or not pruefi_didnt_change_since_last_scraping:
unfolded_ahb.dump_csv(output_path)
del unfolded_ahb


Expand Down Expand Up @@ -103,7 +121,7 @@ def process_pruefi(
"""
Process one pruefi.
If the input path ends with .docx, we assume that the file containing the pruefi is given.
Therefore we only access that file.
Therefore, we only access that file.
"""

# TODO try to cache document objects cause it is slow to read them from disk # pylint: disable=fixme
Expand Down
123 changes: 95 additions & 28 deletions src/kohlrahbi/unfoldedahb/unfoldedahbtable.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import re
from functools import lru_cache
from pathlib import Path
from typing import Union
from uuid import uuid4

import attrs
Expand Down Expand Up @@ -368,22 +369,33 @@ def convert_to_flat_ahb(self) -> FlatAnwendungshandbuch:
)
raise

def dump_flatahb_json(self, output_directory_path: Path) -> None:
def get_flatahb_json_file_path(self, output_directory_path: Path) -> Path:
"""
Converts the unfolded AHB to a flat AHB and writes it to a json file.
The file will be stored in the directory:
'output_directory_path/<edifact_format>/flatahb/<pruefidentifikator>.json'
returns the filepath to where the flat ahb json will be dumped when using dump_flatahb_json()
raises a value error when the pruefidentifikator is not a valid one
"""
edifact_format = get_format_of_pruefidentifikator(self.meta_data.pruefidentifikator)
if edifact_format is None:
logger.warning("'%s' is not a pruefidentifikator", self.meta_data.pruefidentifikator)
return
raise ValueError(f"'{self.meta_data.pruefidentifikator}' is not a pruefidentifikator")

flatahb_output_directory_path = output_directory_path / str(edifact_format) / "flatahb"
flatahb_output_directory_path.mkdir(parents=True, exist_ok=True)
flat_ahb = self.convert_to_flat_ahb()

file_path = flatahb_output_directory_path / f"{self.meta_data.pruefidentifikator}.json"
return file_path

def dump_flatahb_json(self, output_directory_path: Path) -> None:
"""
Converts the unfolded AHB to a flat AHB and writes it to a json file.
The file will be stored in the directory:
'output_directory_path/<edifact_format>/flatahb/<pruefidentifikator>.json'
"""
try:
file_path = self.get_flatahb_json_file_path(output_directory_path)
except ValueError:
return
flatahb_directory = file_path.parent
flatahb_directory.mkdir(parents=True, exist_ok=True)
flat_ahb = self.convert_to_flat_ahb()
if file_path.exists():
with open(file_path, "r", encoding="utf-8") as file:
existing_flat_ahb = FlatAnwendungshandbuchSchema().load(json.load(file))
Expand All @@ -394,7 +406,7 @@ def dump_flatahb_json(self, output_directory_path: Path) -> None:
logger.info(
"The flatahb file for %s is saved at %s",
self.meta_data.pruefidentifikator,
flatahb_output_directory_path / f"{self.meta_data.pruefidentifikator}.json",
file_path.absolute(),
)
del flat_ahb
del dump_data
Expand Down Expand Up @@ -424,48 +436,70 @@ def convert_to_dataframe(self) -> pd.DataFrame:
df.fillna(value="", inplace=True)
return df

def get_csv_file_path(self, output_directory_path: Path) -> Path:
    """
    Return the path at which dump_csv() will write the CSV file
    ('output_directory_path/<edifact_format>/csv/<pruefidentifikator>.csv').

    Raises a ValueError when the pruefidentifikator is not a valid one.
    """
    pruefi = self.meta_data.pruefidentifikator
    edifact_format = get_format_of_pruefidentifikator(pruefi)
    if edifact_format is None:
        # log first so the caller that swallows the ValueError still leaves a trace
        logger.warning("'%s' is not a pruefidentifikator", pruefi)
        raise ValueError(f"'{pruefi}' is not a pruefidentifikator")
    return output_directory_path / str(edifact_format) / "csv" / f"{pruefi}.csv"

def dump_csv(self, path_to_output_directory: Path) -> None:
    """
    Dump a UnfoldedAHB table into a csv file.
    The file will be stored in the directory:
    'path_to_output_directory/<edifact_format>/csv/<pruefidentifikator>.csv'
    Does nothing (beyond the warning already logged by get_csv_file_path) when the
    pruefidentifikator is invalid.
    """
    # Resolve the target path FIRST: converting the table to a dataframe is the
    # expensive part, and it would be wasted work if the pruefidentifikator is invalid.
    try:
        csv_file_path = self.get_csv_file_path(path_to_output_directory)
    except ValueError:
        # get_csv_file_path already logged a warning for the invalid pruefi
        return
    csv_file_path.parent.mkdir(parents=True, exist_ok=True)

    df = self.convert_to_dataframe()
    df.to_csv(csv_file_path, encoding="utf-8")
    logger.info("The csv file for %s is saved at %s", self.meta_data.pruefidentifikator, csv_file_path.absolute())
    del df

def get_xlsx_file_path(self, output_directory_path: Path) -> Path:
"""
returns the filepath to where the xlsx will be dumped when using dump_xlsx()
raises a value error when the pruefidentifikator is not a valid one
"""
edifact_format = get_format_of_pruefidentifikator(self.meta_data.pruefidentifikator)
if edifact_format is None:
logger.warning("'%s' is not a pruefidentifikator", self.meta_data.pruefidentifikator)
return

csv_output_directory_path = path_to_output_directory / str(edifact_format) / "csv"
csv_output_directory_path.mkdir(parents=True, exist_ok=True)
raise ValueError(f"'{self.meta_data.pruefidentifikator}' is not a pruefidentifikator")

df.to_csv(csv_output_directory_path / f"{self.meta_data.pruefidentifikator}.csv", encoding="utf-8")
logger.info(
"The csv file for %s is saved at %s",
self.meta_data.pruefidentifikator,
csv_output_directory_path / f"{self.meta_data.pruefidentifikator}.csv",
)
del df
xlsx_output_directory_path: Path = output_directory_path / str(edifact_format)
file_path = xlsx_output_directory_path / f"{self.meta_data.pruefidentifikator}.xlsx"
return file_path

def dump_xlsx(self, path_to_output_directory: Path) -> None:
"""
Dump a AHB table of a given pruefi into an excel file.
The file will be stored in the directory:
'path_to_output_directory/<edifact_format>/xlsx/<pruefidentifikator>.xlsx'
"""
edifact_format = get_format_of_pruefidentifikator(self.meta_data.pruefidentifikator)
xlsx_output_directory_path: Path = path_to_output_directory / str(edifact_format) / "xlsx"
try:
excel_file_path = self.get_xlsx_file_path(path_to_output_directory)
except ValueError:
return
xlsx_output_directory_path = excel_file_path.parent
xlsx_output_directory_path.mkdir(parents=True, exist_ok=True)

excel_file_name = f"{self.meta_data.pruefidentifikator}.xlsx"

df = self.convert_to_dataframe()

try:
# https://github.com/PyCQA/pylint/issues/3060
# pylint: disable=abstract-class-instantiated
with pd.ExcelWriter(xlsx_output_directory_path / excel_file_name, engine="xlsxwriter") as writer:
with pd.ExcelWriter(excel_file_path, engine="xlsxwriter") as writer:
df.to_excel(writer, sheet_name=f"{self.meta_data.pruefidentifikator}")
# pylint: disable=no-member
workbook = writer.book
Expand All @@ -476,12 +510,12 @@ def dump_xlsx(self, path_to_output_directory: Path) -> None:
worksheet.set_column(excel_header, column_width, wrap_format)
logger.info("💾 Saved file(s) for Pruefidentifikator %s", self.meta_data.pruefidentifikator)
except PermissionError:
logger.error("The Excel file %s is open. Please close this file and try again.", excel_file_name)
logger.error("The Excel file %s is open. Please close this file and try again.", excel_file_path)

logger.info(
"The xlsx file for %s is saved at %s",
self.meta_data.pruefidentifikator,
xlsx_output_directory_path / f"{self.meta_data.pruefidentifikator}.json",
excel_file_path.absolute(),
)


Expand Down Expand Up @@ -518,3 +552,36 @@ def _remove_irrelevant_lines(lines: list[AhbLine]) -> list[AhbLine]:
if not is_double_line and not is_empty_ahb_line:
reduced_lines.append(line)
return reduced_lines


def _get_ahb(ahb_model_or_path: Union[FlatAnwendungshandbuch, UnfoldedAhb, Path]) -> FlatAnwendungshandbuch:
    """
    Coerce any of the three accepted AHB representations into a FlatAnwendungshandbuch:
    an instance is returned as-is, an UnfoldedAhb is converted, and a Path is read as a
    flat-AHB JSON file. Anything else raises a ValueError.
    """
    result: FlatAnwendungshandbuch
    if isinstance(ahb_model_or_path, FlatAnwendungshandbuch):
        result = ahb_model_or_path
    elif isinstance(ahb_model_or_path, UnfoldedAhb):
        result = ahb_model_or_path.convert_to_flat_ahb()
    elif isinstance(ahb_model_or_path, Path):
        with open(ahb_model_or_path, "r", encoding="utf-8") as json_file:
            result = FlatAnwendungshandbuchSchema().load(json.load(json_file))  # type:ignore[assignment]
    else:
        raise ValueError(
            f"argument must be either a FlatAnwendungshandbuch, UnfoldedAhb or a Path but was {type(ahb_model_or_path)}"
        )
    return result


def are_equal_except_for_guids(
    ahb_1: Union[FlatAnwendungshandbuch, UnfoldedAhb, Path],
    ahb_2: Union[FlatAnwendungshandbuch, UnfoldedAhb, Path],
) -> bool:
    """returns true iff both provided AHBs are equal except for/when ignoring their line guids"""
    left = _get_ahb(ahb_1)
    right = _get_ahb(ahb_2)
    # cheap checks first: differing metadata or line counts rule out equality immediately
    if left.meta != right.meta or len(left.lines) != len(right.lines):
        return False
    return all(
        _lines_are_equal_when_ignoring_guid(left_line, right_line)
        for left_line, right_line in zip(left.lines, right.lines, strict=True)
    )

0 comments on commit 49119be

Please sign in to comment.