Skip to content

Commit

Permalink
Avoid unnecessary write operations: Don't dump if JSON output didn't …
Browse files Browse the repository at this point in the history
…changed (#384)

* Avoid unnecessary write operations: Don't dump if nothing changed

The current behaviour is especially annoying in the machine-readable_anwendungshandbuch repository, where we get lots of noise in the Excel files.
see e.g. Hochfrequenz/machine-readable_anwendungshandbuecher#216
or Hochfrequenz/machine-readable_anwendungshandbuecher#217

* shut up mypy

* shutup 2

* Update src/kohlrahbi/ahb/__init__.py

* reduce redundancy in logging
  • Loading branch information
hf-kklein authored Jul 30, 2024
1 parent 88b596f commit 49119be
Show file tree
Hide file tree
Showing 2 changed files with 117 additions and 32 deletions.
26 changes: 22 additions & 4 deletions src/kohlrahbi/ahb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
)
from kohlrahbi.seed import Seed
from kohlrahbi.unfoldedahb import UnfoldedAhb
from kohlrahbi.unfoldedahb.unfoldedahbtable import are_equal_except_for_guids

_pruefi_pattern = re.compile(r"^[1-9]\d{4}$")

Expand Down Expand Up @@ -55,12 +56,29 @@ def process_ahb_table(
"""
unfolded_ahb = UnfoldedAhb.from_ahb_table(ahb_table=ahb_table, pruefi=pruefi)

try:
json_file_path = unfolded_ahb.get_flatahb_json_file_path(output_path)
excel_file_path = unfolded_ahb.get_xlsx_file_path(output_path)
csv_file_path = unfolded_ahb.get_csv_file_path(output_path)
except ValueError:
logger.warning("Error while determining file paths for pruefi '%s'. Skipping saving files.", pruefi)
return
pruefi_didnt_change_since_last_scraping: Optional[bool] = None
if AhbExportFileFormat.FLATAHB in file_type:
# the flat ahb ist the only file format from which we can READ to compare our current with previous results
if json_file_path.exists():
pruefi_didnt_change_since_last_scraping = are_equal_except_for_guids(unfolded_ahb, json_file_path)
# ⚠ here we assume that the csv/json/xlsx files are in sync, if they exist.
# this means: if the json file didn't change and a csv file exists, we expect the csv file to also be unchanged
if AhbExportFileFormat.XLSX in file_type:
unfolded_ahb.dump_xlsx(output_path)
if not excel_file_path.exists() or not pruefi_didnt_change_since_last_scraping:
unfolded_ahb.dump_xlsx(output_path)
if AhbExportFileFormat.FLATAHB in file_type:
unfolded_ahb.dump_flatahb_json(output_path)
if not json_file_path.exists() or not pruefi_didnt_change_since_last_scraping:
unfolded_ahb.dump_flatahb_json(output_path)
if AhbExportFileFormat.CSV in file_type:
unfolded_ahb.dump_csv(output_path)
if not csv_file_path.exists() or not pruefi_didnt_change_since_last_scraping:
unfolded_ahb.dump_csv(output_path)
del unfolded_ahb


Expand Down Expand Up @@ -103,7 +121,7 @@ def process_pruefi(
"""
Process one pruefi.
If the input path ends with .docx, we assume that the file containing the pruefi is given.
Therefore we only access that file.
Therefore, we only access that file.
"""

# TODO try to cache document objects cause it is slow to read them from disk # pylint: disable=fixme
Expand Down
123 changes: 95 additions & 28 deletions src/kohlrahbi/unfoldedahb/unfoldedahbtable.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import re
from functools import lru_cache
from pathlib import Path
from typing import Union
from uuid import uuid4

import attrs
Expand Down Expand Up @@ -368,22 +369,33 @@ def convert_to_flat_ahb(self) -> FlatAnwendungshandbuch:
)
raise

def dump_flatahb_json(self, output_directory_path: Path) -> None:
def get_flatahb_json_file_path(self, output_directory_path: Path) -> Path:
"""
Converts the unfolded AHB to a flat AHB and writes it to a json file.
The file will be stored in the directory:
'output_directory_path/<edifact_format>/flatahb/<pruefidentifikator>.json'
returns the filepath to where the flat ahb json will be dumped when using dump_flatahb_json()
raises a value error when the pruefidentifikator is not a valid one
"""
edifact_format = get_format_of_pruefidentifikator(self.meta_data.pruefidentifikator)
if edifact_format is None:
logger.warning("'%s' is not a pruefidentifikator", self.meta_data.pruefidentifikator)
return
raise ValueError(f"'{self.meta_data.pruefidentifikator}' is not a pruefidentifikator")

flatahb_output_directory_path = output_directory_path / str(edifact_format) / "flatahb"
flatahb_output_directory_path.mkdir(parents=True, exist_ok=True)
flat_ahb = self.convert_to_flat_ahb()

file_path = flatahb_output_directory_path / f"{self.meta_data.pruefidentifikator}.json"
return file_path

def dump_flatahb_json(self, output_directory_path: Path) -> None:
"""
Converts the unfolded AHB to a flat AHB and writes it to a json file.
The file will be stored in the directory:
'output_directory_path/<edifact_format>/flatahb/<pruefidentifikator>.json'
"""
try:
file_path = self.get_flatahb_json_file_path(output_directory_path)
except ValueError:
return
flatahb_directory = file_path.parent
flatahb_directory.mkdir(parents=True, exist_ok=True)
flat_ahb = self.convert_to_flat_ahb()
if file_path.exists():
with open(file_path, "r", encoding="utf-8") as file:
existing_flat_ahb = FlatAnwendungshandbuchSchema().load(json.load(file))
Expand All @@ -394,7 +406,7 @@ def dump_flatahb_json(self, output_directory_path: Path) -> None:
logger.info(
"The flatahb file for %s is saved at %s",
self.meta_data.pruefidentifikator,
flatahb_output_directory_path / f"{self.meta_data.pruefidentifikator}.json",
file_path.absolute(),
)
del flat_ahb
del dump_data
Expand Down Expand Up @@ -424,48 +436,70 @@ def convert_to_dataframe(self) -> pd.DataFrame:
df.fillna(value="", inplace=True)
return df

def get_csv_file_path(self, output_directory_path: Path) -> Path:
    """
    Return the path at which dump_csv() will write the CSV file
    ('output_directory_path/<edifact_format>/csv/<pruefidentifikator>.csv').

    Raises a ValueError when the pruefidentifikator is not a valid one.
    """
    pruefi = self.meta_data.pruefidentifikator
    edifact_format = get_format_of_pruefidentifikator(pruefi)
    if edifact_format is None:
        # log first so the caller that swallows the ValueError still leaves a trace
        logger.warning("'%s' is not a pruefidentifikator", pruefi)
        raise ValueError(f"'{pruefi}' is not a pruefidentifikator")
    return output_directory_path / str(edifact_format) / "csv" / f"{pruefi}.csv"

def dump_csv(self, path_to_output_directory: Path) -> None:
    """
    Dump a UnfoldedAHB table into a csv file.
    The file will be stored in the directory:
    'path_to_output_directory/<edifact_format>/csv/<pruefidentifikator>.csv'
    Does nothing (beyond the warning already logged by get_csv_file_path) when the
    pruefidentifikator is invalid.
    """
    # Resolve the target path FIRST: converting the table to a dataframe is the
    # expensive part, and it would be wasted work if the pruefidentifikator is invalid.
    try:
        csv_file_path = self.get_csv_file_path(path_to_output_directory)
    except ValueError:
        # get_csv_file_path already logged a warning for the invalid pruefi
        return
    csv_file_path.parent.mkdir(parents=True, exist_ok=True)

    df = self.convert_to_dataframe()
    df.to_csv(csv_file_path, encoding="utf-8")
    logger.info("The csv file for %s is saved at %s", self.meta_data.pruefidentifikator, csv_file_path.absolute())
    del df

def get_xlsx_file_path(self, output_directory_path: Path) -> Path:
"""
returns the filepath to where the xlsx will be dumped when using dump_xlsx()
raises a value error when the pruefidentifikator is not a valid one
"""
edifact_format = get_format_of_pruefidentifikator(self.meta_data.pruefidentifikator)
if edifact_format is None:
logger.warning("'%s' is not a pruefidentifikator", self.meta_data.pruefidentifikator)
return

csv_output_directory_path = path_to_output_directory / str(edifact_format) / "csv"
csv_output_directory_path.mkdir(parents=True, exist_ok=True)
raise ValueError(f"'{self.meta_data.pruefidentifikator}' is not a pruefidentifikator")

df.to_csv(csv_output_directory_path / f"{self.meta_data.pruefidentifikator}.csv", encoding="utf-8")
logger.info(
"The csv file for %s is saved at %s",
self.meta_data.pruefidentifikator,
csv_output_directory_path / f"{self.meta_data.pruefidentifikator}.csv",
)
del df
xlsx_output_directory_path: Path = output_directory_path / str(edifact_format)
file_path = xlsx_output_directory_path / f"{self.meta_data.pruefidentifikator}.xlsx"
return file_path

def dump_xlsx(self, path_to_output_directory: Path) -> None:
"""
Dump a AHB table of a given pruefi into an excel file.
The file will be stored in the directory:
'path_to_output_directory/<edifact_format>/xlsx/<pruefidentifikator>.xlsx'
"""
edifact_format = get_format_of_pruefidentifikator(self.meta_data.pruefidentifikator)
xlsx_output_directory_path: Path = path_to_output_directory / str(edifact_format) / "xlsx"
try:
excel_file_path = self.get_xlsx_file_path(path_to_output_directory)
except ValueError:
return
xlsx_output_directory_path = excel_file_path.parent
xlsx_output_directory_path.mkdir(parents=True, exist_ok=True)

excel_file_name = f"{self.meta_data.pruefidentifikator}.xlsx"

df = self.convert_to_dataframe()

try:
# https://github.com/PyCQA/pylint/issues/3060
# pylint: disable=abstract-class-instantiated
with pd.ExcelWriter(xlsx_output_directory_path / excel_file_name, engine="xlsxwriter") as writer:
with pd.ExcelWriter(excel_file_path, engine="xlsxwriter") as writer:
df.to_excel(writer, sheet_name=f"{self.meta_data.pruefidentifikator}")
# pylint: disable=no-member
workbook = writer.book
Expand All @@ -476,12 +510,12 @@ def dump_xlsx(self, path_to_output_directory: Path) -> None:
worksheet.set_column(excel_header, column_width, wrap_format)
logger.info("💾 Saved file(s) for Pruefidentifikator %s", self.meta_data.pruefidentifikator)
except PermissionError:
logger.error("The Excel file %s is open. Please close this file and try again.", excel_file_name)
logger.error("The Excel file %s is open. Please close this file and try again.", excel_file_path)

logger.info(
"The xlsx file for %s is saved at %s",
self.meta_data.pruefidentifikator,
xlsx_output_directory_path / f"{self.meta_data.pruefidentifikator}.json",
excel_file_path.absolute(),
)


Expand Down Expand Up @@ -518,3 +552,36 @@ def _remove_irrelevant_lines(lines: list[AhbLine]) -> list[AhbLine]:
if not is_double_line and not is_empty_ahb_line:
reduced_lines.append(line)
return reduced_lines


def _get_ahb(ahb_model_or_path: Union[FlatAnwendungshandbuch, UnfoldedAhb, Path]) -> FlatAnwendungshandbuch:
    """
    Coerce any of the three accepted AHB representations into a FlatAnwendungshandbuch:
    an instance is returned as-is, an UnfoldedAhb is converted, and a Path is read as a
    flat-AHB JSON file. Anything else raises a ValueError.
    """
    result: FlatAnwendungshandbuch
    if isinstance(ahb_model_or_path, FlatAnwendungshandbuch):
        result = ahb_model_or_path
    elif isinstance(ahb_model_or_path, UnfoldedAhb):
        result = ahb_model_or_path.convert_to_flat_ahb()
    elif isinstance(ahb_model_or_path, Path):
        with open(ahb_model_or_path, "r", encoding="utf-8") as json_file:
            result = FlatAnwendungshandbuchSchema().load(json.load(json_file))  # type:ignore[assignment]
    else:
        raise ValueError(
            f"argument must be either a FlatAnwendungshandbuch, UnfoldedAhb or a Path but was {type(ahb_model_or_path)}"
        )
    return result


def are_equal_except_for_guids(
    ahb_1: Union[FlatAnwendungshandbuch, UnfoldedAhb, Path],
    ahb_2: Union[FlatAnwendungshandbuch, UnfoldedAhb, Path],
) -> bool:
    """returns true iff both provided AHBs are equal except for/when ignoring their line guids"""
    left = _get_ahb(ahb_1)
    right = _get_ahb(ahb_2)
    # cheap checks first: differing metadata or line counts rule out equality immediately
    if left.meta != right.meta or len(left.lines) != len(right.lines):
        return False
    return all(
        _lines_are_equal_when_ignoring_guid(left_line, right_line)
        for left_line, right_line in zip(left.lines, right.lines, strict=True)
    )

0 comments on commit 49119be

Please sign in to comment.