From 49119bed275e40d99530ef1392ad11b8f96f0799 Mon Sep 17 00:00:00 2001 From: konstantin Date: Tue, 30 Jul 2024 15:52:22 +0200 Subject: [PATCH] Avoid unnecessary write operations: Don't dump if JSON output didn't change (#384) * Avoid unnecessary write operations: Don't dump if nothing changed the current behaviour is especially annoying in the machine-readable_anwendungshandbuch repository where we get lots of noise in the excel files see e.g. https://github.com/Hochfrequenz/machine-readable_anwendungshandbuecher/pull/216 or https://github.com/Hochfrequenz/machine-readable_anwendungshandbuecher/pull/217 * shut up mypy * shutup 2 * Update src/kohlrahbi/ahb/__init__.py * reduce redundancy in logging --- src/kohlrahbi/ahb/__init__.py | 26 +++- src/kohlrahbi/unfoldedahb/unfoldedahbtable.py | 123 ++++++++++++++---- 2 files changed, 117 insertions(+), 32 deletions(-) diff --git a/src/kohlrahbi/ahb/__init__.py b/src/kohlrahbi/ahb/__init__.py index fd2c6c03..a4275488 100644 --- a/src/kohlrahbi/ahb/__init__.py +++ b/src/kohlrahbi/ahb/__init__.py @@ -27,6 +27,7 @@ ) from kohlrahbi.seed import Seed from kohlrahbi.unfoldedahb import UnfoldedAhb +from kohlrahbi.unfoldedahb.unfoldedahbtable import are_equal_except_for_guids _pruefi_pattern = re.compile(r"^[1-9]\d{4}$") @@ -55,12 +56,29 @@ def process_ahb_table( """ unfolded_ahb = UnfoldedAhb.from_ahb_table(ahb_table=ahb_table, pruefi=pruefi) + try: + json_file_path = unfolded_ahb.get_flatahb_json_file_path(output_path) + excel_file_path = unfolded_ahb.get_xlsx_file_path(output_path) + csv_file_path = unfolded_ahb.get_csv_file_path(output_path) + except ValueError: + logger.warning("Error while determining file paths for pruefi '%s'. 
Skipping saving files.", pruefi) return + pruefi_didnt_change_since_last_scraping: Optional[bool] = None + if AhbExportFileFormat.FLATAHB in file_type: + # the flat ahb is the only file format from which we can READ to compare our current with previous results + if json_file_path.exists(): + pruefi_didnt_change_since_last_scraping = are_equal_except_for_guids(unfolded_ahb, json_file_path) + # ⚠ here we assume that the csv/json/xlsx files are in sync, if they exist. + # this means: if the json file didn't change and a csv file exists, we expect the csv file to also be unchanged if AhbExportFileFormat.XLSX in file_type: - unfolded_ahb.dump_xlsx(output_path) + if not excel_file_path.exists() or not pruefi_didnt_change_since_last_scraping: + unfolded_ahb.dump_xlsx(output_path) if AhbExportFileFormat.FLATAHB in file_type: - unfolded_ahb.dump_flatahb_json(output_path) + if not json_file_path.exists() or not pruefi_didnt_change_since_last_scraping: + unfolded_ahb.dump_flatahb_json(output_path) if AhbExportFileFormat.CSV in file_type: - unfolded_ahb.dump_csv(output_path) + if not csv_file_path.exists() or not pruefi_didnt_change_since_last_scraping: + unfolded_ahb.dump_csv(output_path) del unfolded_ahb @@ -103,7 +121,7 @@ def process_pruefi( """ Process one pruefi. If the input path ends with .docx, we assume that the file containing the pruefi is given. - Therefore we only access that file. + Therefore, we only access that file. 
""" # TODO try to cache document objects cause it is slow to read them from disk # pylint: disable=fixme diff --git a/src/kohlrahbi/unfoldedahb/unfoldedahbtable.py b/src/kohlrahbi/unfoldedahb/unfoldedahbtable.py index f2a4ef55..6b143845 100644 --- a/src/kohlrahbi/unfoldedahb/unfoldedahbtable.py +++ b/src/kohlrahbi/unfoldedahb/unfoldedahbtable.py @@ -7,6 +7,7 @@ import re from functools import lru_cache from pathlib import Path +from typing import Union from uuid import uuid4 import attrs @@ -368,22 +369,33 @@ def convert_to_flat_ahb(self) -> FlatAnwendungshandbuch: ) raise - def dump_flatahb_json(self, output_directory_path: Path) -> None: + def get_flatahb_json_file_path(self, output_directory_path: Path) -> Path: """ - Converts the unfolded AHB to a flat AHB and writes it to a json file. - The file will be stored in the directory: - 'output_directory_path//flatahb/.json' + returns the filepath to where the flat ahb json will be dumped when using dump_flatahb_json() + raises a value error when the pruefidentifikator is not a valid one """ edifact_format = get_format_of_pruefidentifikator(self.meta_data.pruefidentifikator) if edifact_format is None: logger.warning("'%s' is not a pruefidentifikator", self.meta_data.pruefidentifikator) - return + raise ValueError(f"'{self.meta_data.pruefidentifikator}' is not a pruefidentifikator") flatahb_output_directory_path = output_directory_path / str(edifact_format) / "flatahb" - flatahb_output_directory_path.mkdir(parents=True, exist_ok=True) - flat_ahb = self.convert_to_flat_ahb() - file_path = flatahb_output_directory_path / f"{self.meta_data.pruefidentifikator}.json" + return file_path + + def dump_flatahb_json(self, output_directory_path: Path) -> None: + """ + Converts the unfolded AHB to a flat AHB and writes it to a json file. 
+ The file will be stored in the directory: + 'output_directory_path//flatahb/.json' + """ + try: + file_path = self.get_flatahb_json_file_path(output_directory_path) + except ValueError: + return + flatahb_directory = file_path.parent + flatahb_directory.mkdir(parents=True, exist_ok=True) + flat_ahb = self.convert_to_flat_ahb() if file_path.exists(): with open(file_path, "r", encoding="utf-8") as file: existing_flat_ahb = FlatAnwendungshandbuchSchema().load(json.load(file)) @@ -394,7 +406,7 @@ def dump_flatahb_json(self, output_directory_path: Path) -> None: logger.info( "The flatahb file for %s is saved at %s", self.meta_data.pruefidentifikator, - flatahb_output_directory_path / f"{self.meta_data.pruefidentifikator}.json", + file_path.absolute(), ) del flat_ahb del dump_data @@ -424,6 +436,20 @@ def convert_to_dataframe(self) -> pd.DataFrame: df.fillna(value="", inplace=True) return df + def get_csv_file_path(self, output_directory_path: Path) -> Path: + """ + returns the filepath to where the CSV will be dumped when using dump_csv() + raises a value error when the pruefidentifikator is not a valid one + """ + edifact_format = get_format_of_pruefidentifikator(self.meta_data.pruefidentifikator) + if edifact_format is None: + logger.warning("'%s' is not a pruefidentifikator", self.meta_data.pruefidentifikator) + raise ValueError(f"'{self.meta_data.pruefidentifikator}' is not a pruefidentifikator") + + csv_output_directory_path = output_directory_path / str(edifact_format) / "csv" + file_path = csv_output_directory_path / f"{self.meta_data.pruefidentifikator}.csv" + return file_path + def dump_csv(self, path_to_output_directory: Path) -> None: """ Dump a UnfoldedAHB table into a csv file. 
@@ -431,22 +457,30 @@ def dump_csv(self, path_to_output_directory: Path) -> None: 'path_to_output_directory//csv/.csv' """ df = self.convert_to_dataframe() + try: + csv_file_path = self.get_csv_file_path(path_to_output_directory) + except ValueError: + return + csv_output_directory_path = csv_file_path.parent + csv_output_directory_path.mkdir(parents=True, exist_ok=True) + df.to_csv(csv_file_path, encoding="utf-8") + logger.info("The csv file for %s is saved at %s", self.meta_data.pruefidentifikator, csv_file_path.absolute()) + del df + + def get_xlsx_file_path(self, output_directory_path: Path) -> Path: + """ + returns the filepath to where the xlsx will be dumped when using dump_xlsx() + raises a value error when the pruefidentifikator is not a valid one + """ edifact_format = get_format_of_pruefidentifikator(self.meta_data.pruefidentifikator) if edifact_format is None: logger.warning("'%s' is not a pruefidentifikator", self.meta_data.pruefidentifikator) - return - - csv_output_directory_path = path_to_output_directory / str(edifact_format) / "csv" - csv_output_directory_path.mkdir(parents=True, exist_ok=True) + raise ValueError(f"'{self.meta_data.pruefidentifikator}' is not a pruefidentifikator") - df.to_csv(csv_output_directory_path / f"{self.meta_data.pruefidentifikator}.csv", encoding="utf-8") - logger.info( - "The csv file for %s is saved at %s", - self.meta_data.pruefidentifikator, - csv_output_directory_path / f"{self.meta_data.pruefidentifikator}.csv", - ) - del df + xlsx_output_directory_path: Path = output_directory_path / str(edifact_format) + file_path = xlsx_output_directory_path / f"{self.meta_data.pruefidentifikator}.xlsx" + return file_path def dump_xlsx(self, path_to_output_directory: Path) -> None: """ @@ -454,18 +488,18 @@ def dump_xlsx(self, path_to_output_directory: Path) -> None: The file will be stored in the directory: 'path_to_output_directory//xlsx/.xlsx' """ - edifact_format = 
get_format_of_pruefidentifikator(self.meta_data.pruefidentifikator) - xlsx_output_directory_path: Path = path_to_output_directory / str(edifact_format) / "xlsx" + try: + excel_file_path = self.get_xlsx_file_path(path_to_output_directory) + except ValueError: + return + xlsx_output_directory_path = excel_file_path.parent xlsx_output_directory_path.mkdir(parents=True, exist_ok=True) - excel_file_name = f"{self.meta_data.pruefidentifikator}.xlsx" - df = self.convert_to_dataframe() - try: # https://github.com/PyCQA/pylint/issues/3060 # pylint: disable=abstract-class-instantiated - with pd.ExcelWriter(xlsx_output_directory_path / excel_file_name, engine="xlsxwriter") as writer: + with pd.ExcelWriter(excel_file_path, engine="xlsxwriter") as writer: df.to_excel(writer, sheet_name=f"{self.meta_data.pruefidentifikator}") # pylint: disable=no-member workbook = writer.book @@ -476,12 +510,12 @@ def dump_xlsx(self, path_to_output_directory: Path) -> None: worksheet.set_column(excel_header, column_width, wrap_format) logger.info("💾 Saved file(s) for Pruefidentifikator %s", self.meta_data.pruefidentifikator) except PermissionError: - logger.error("The Excel file %s is open. Please close this file and try again.", excel_file_name) + logger.error("The Excel file %s is open. 
Please close this file and try again.", excel_file_path) logger.info( "The xlsx file for %s is saved at %s", self.meta_data.pruefidentifikator, - xlsx_output_directory_path / f"{self.meta_data.pruefidentifikator}.json", + excel_file_path.absolute(), ) @@ -518,3 +552,36 @@ def _remove_irrelevant_lines(lines: list[AhbLine]) -> list[AhbLine]: if not is_double_line and not is_empty_ahb_line: reduced_lines.append(line) return reduced_lines + + +def _get_ahb(ahb_model_or_path: Union[FlatAnwendungshandbuch, UnfoldedAhb, Path]) -> FlatAnwendungshandbuch: + """ + returns the AHB model + """ + if isinstance(ahb_model_or_path, FlatAnwendungshandbuch): + return ahb_model_or_path + if isinstance(ahb_model_or_path, UnfoldedAhb): + return ahb_model_or_path.convert_to_flat_ahb() + if isinstance(ahb_model_or_path, Path): + with open(ahb_model_or_path, "r", encoding="utf-8") as file: + return FlatAnwendungshandbuchSchema().load(json.load(file)) # type:ignore[no-any-return] + raise ValueError( + f"argument must be either a FlatAnwendungshandbuch, UnfoldedAhb or a Path but was {type(ahb_model_or_path)}" + ) + + +def are_equal_except_for_guids( + ahb_1: Union[FlatAnwendungshandbuch, UnfoldedAhb, Path], + ahb_2: Union[FlatAnwendungshandbuch, UnfoldedAhb, Path], +) -> bool: + """returns true iff both provided AHBs are equal except for/when ignoring their line guids""" + ahb1 = _get_ahb(ahb_1) + ahb2 = _get_ahb(ahb_2) + if ahb1.meta != ahb2.meta: + return False + if len(ahb1.lines) != len(ahb2.lines): + return False + for line1, line2 in zip(ahb1.lines, ahb2.lines, strict=True): + if not _lines_are_equal_when_ignoring_guid(line1, line2): + return False + return True