diff --git a/.github/workflows/cli_test.yml b/.github/workflows/cli_test.yml index a3c285e8..414642cb 100644 --- a/.github/workflows/cli_test.yml +++ b/.github/workflows/cli_test.yml @@ -22,4 +22,4 @@ jobs: - name: Try to install the script, then run the cli run: | pip install . - ahlbatross \ No newline at end of file + ahlbatross -i data/machine-readable_anwendungshandbuecher -o data/output diff --git a/src/ahlbatross/cli.py b/src/ahlbatross/cli.py index d305974b..b268b62d 100644 --- a/src/ahlbatross/cli.py +++ b/src/ahlbatross/cli.py @@ -5,13 +5,12 @@ import logging import sys from pathlib import Path -from typing import Optional import pandas as pd import typer from rich.console import Console -from ahlbatross.main import DEFAULT_OUTPUT_DIR, _process_submodule +from ahlbatross.main import process_ahb_data app = typer.Typer(help="ahlbatross diffs machine-readable AHBs") _logger = logging.getLogger(__name__) @@ -20,14 +19,31 @@ @app.command() -def main(output_dir: Optional[Path] = None) -> None: +def main( + input_dir: Path = typer.Option(..., "--input-dir", "-i", help="Directory containing AHB data."), + output_dir: Path = typer.Option( + ..., "--output-dir", "-o", help="Destination path to output directory containing processed files." + ), +) -> None: """ main entrypoint for AHlBatross. """ try: - _process_submodule(output_dir or DEFAULT_OUTPUT_DIR) + if not input_dir.exists(): + _logger.error("❌ Input directory does not exist: %s", input_dir.absolute()) + sys.exit(1) + process_ahb_data(input_dir, output_dir) + except FileNotFoundError as e: + _logger.error("❌ Path error: %s", str(e)) + sys.exit(1) + except PermissionError as e: + _logger.error("❌ Permission denied: %s", str(e)) + sys.exit(1) except (OSError, pd.errors.EmptyDataError, ValueError) as e: - _logger.error("❌error processing AHB files: %s", str(e)) + _logger.exception("❌ Error processing AHB files: %s", str(e)) + sys.exit(1) + except (RuntimeError, TypeError, AttributeError) as e: + _logger.exception("❌ Unexpected error: %s", str(e)) sys.exit(1) @@ -38,6 +54,7 @@ def cli() -> None: app() -# run locally using $ PYTHONPATH=src python -m ahlbatross.cli +# to run the script during local development, execute the following command: +# PYTHONPATH=src python -m ahlbatross.cli -i data/machine-readable_anwendungshandbuecher -o data/output if __name__ == "__main__": main() diff --git a/src/ahlbatross/main.py b/src/ahlbatross/main.py index b54e9ccb..22df01f8 100644 --- a/src/ahlbatross/main.py +++ b/src/ahlbatross/main.py @@ -6,7 +6,7 @@ import re import sys from pathlib import Path -from typing import Any, Tuple, TypeAlias +from typing import Any, TypeAlias import pandas as pd from pandas.core.frame import DataFrame @@ -19,26 +19,25 @@ logging.basicConfig(level=logging.INFO, format="%(message)s", stream=sys.stdout) logger = logging.getLogger(__name__) -SUBMODULE = Path("data/machine-readable_anwendungshandbuecher") -DEFAULT_OUTPUT_DIR = Path("data/output") - XlsxFormat: TypeAlias = Format -def _get_available_formatversions() -> list[str]: +def _is_formatversion_dir(path: Path) -> bool: """ - get all available directories in SUBMODULE, sorted from latest to oldest. + confirm if path is a formatversion directory - for instance "FV2504/". """ - if not SUBMODULE.exists(): - logger.error("❌Base directory does not exist: %s", SUBMODULE) - return [] + return path.is_dir() and path.name.startswith("FV") and len(path.name) == 6 - formatversion_dirs = [ - d.name for d in SUBMODULE.iterdir() if d.is_dir() and d.name.startswith("FV") and len(d.name) == 6 - ] - formatversion_dirs.sort(key=parse_formatversions, reverse=True) +def _get_available_formatversions(root_dir: Path) -> list[str]: + """ + get all available directories, sorted from latest to oldest. + """ + if not root_dir.exists(): + raise FileNotFoundError(f"❌ Submodule / base directory does not exist: {root_dir}") + formatversion_dirs = [d.name for d in root_dir.iterdir() if _is_formatversion_dir(d)] + formatversion_dirs.sort(key=parse_formatversions, reverse=True) return formatversion_dirs @@ -47,28 +46,27 @@ def _get_nachrichtenformat_dirs(formatversion_dir: Path) -> list[Path]: get all directories that contain a csv subdirectory. """ if not formatversion_dir.exists(): - logger.warning("❌formatversion directory not found: %s", formatversion_dir) - return [] + raise FileNotFoundError(f"❌ Formatversion directory not found: {formatversion_dir.absolute()}") return [d for d in formatversion_dir.iterdir() if d.is_dir() and (d / "csv").exists() and (d / "csv").is_dir()] -def _is_formatversion_dir_empty(formatversion: str) -> bool: +def _is_formatversion_dir_empty(root_dir: Path, formatversion: str) -> bool: """ check if a directory does not contain any directories. """ - formatversion_dir = SUBMODULE / formatversion + formatversion_dir = root_dir / formatversion if not formatversion_dir.exists(): return True return len(_get_nachrichtenformat_dirs(formatversion_dir)) == 0 -def determine_consecutive_formatversions() -> list[Tuple[str, str]]: +def determine_consecutive_formatversions(root_dir: Path) -> list[tuple[str, str]]: """ generate pairs of consecutive directories to compare and skip empty directories. """ - formatversion_list = _get_available_formatversions() + formatversion_list = _get_available_formatversions(root_dir) consecutive_formatversions = [] for i in range(len(formatversion_list) - 1): @@ -76,9 +74,11 @@ def determine_consecutive_formatversions() -> list[Tuple[str, str]]: previous_formatversion = formatversion_list[i + 1] # skip if either directory is empty. - if _is_formatversion_dir_empty(subsequent_formatversion) or _is_formatversion_dir_empty(previous_formatversion): + if _is_formatversion_dir_empty(root_dir, subsequent_formatversion) or _is_formatversion_dir_empty( + root_dir, previous_formatversion + ): logger.warning( - "⚠️skipping empty consecutive formatversions: %s -> %s", + "❗️Skipping empty consecutive formatversions: %s -> %s", subsequent_formatversion, previous_formatversion, ) @@ -91,16 +91,16 @@ def determine_consecutive_formatversions() -> list[Tuple[str, str]]: # pylint:disable=too-many-locals def get_matching_pruefid_files( - previous_formatversion: str, subsequent_formatversion: str + root_dir: Path, previous_formatversion: str, subsequent_formatversion: str ) -> list[tuple[Path, Path, str, str]]: """ find matching ahb/.csv files across and directories. """ - previous_formatversion_dir = SUBMODULE / previous_formatversion - subsequent_formatversion_dir = SUBMODULE / subsequent_formatversion + previous_formatversion_dir = root_dir / previous_formatversion + subsequent_formatversion_dir = root_dir / subsequent_formatversion if not all(d.exists() for d in [previous_formatversion_dir, subsequent_formatversion_dir]): - logger.error("❌at least one formatversion directory does not exist.") + logger.error("❌ At least one formatversion directory does not exist.") return [] matching_files = [] @@ -417,12 +417,12 @@ def align_columns( def _process_files( - previous_formatversion: str, subsequent_formatversion: str, output_dir: Path = DEFAULT_OUTPUT_DIR + root_dir: Path, previous_formatversion: str, subsequent_formatversion: str, output_dir: Path ) -> None: """ process all matching ahb/.csv files between two directories. """ - matching_files = get_matching_pruefid_files(previous_formatversion, subsequent_formatversion) + matching_files = get_matching_pruefid_files(root_dir, previous_formatversion, subsequent_formatversion) if not matching_files: logger.warning("No matching files found to compare") @@ -453,25 +453,27 @@ def _process_files( merged_df.to_csv(csv_path, index=False) export_to_excel(merged_df, str(xlsx_path)) - logger.info("✅successfully processed %s/%s", nachrichtentyp, pruefid) + logger.info("✅ Successfully processed %s/%s", nachrichtentyp, pruefid) except pd.errors.EmptyDataError: - logger.error("❌empty or corrupted CSV file for %s/%s", nachrichtentyp, pruefid) + logger.error("❌ Empty or corrupted CSV file for %s/%s", nachrichtentyp, pruefid) except OSError as e: - logger.error("❌file system error for %s/%s: %s", nachrichtentyp, pruefid, str(e)) + logger.error("❌ File system error for %s/%s: %s", nachrichtentyp, pruefid, str(e)) except ValueError as e: - logger.error("❌data processing error for %s/%s: %s", nachrichtentyp, pruefid, str(e)) + logger.error("❌ Data processing error for %s/%s: %s", nachrichtentyp, pruefid, str(e)) -def _process_submodule(output_dir: Path = DEFAULT_OUTPUT_DIR) -> None: +def process_ahb_data(input_dir: Path, output_dir: Path) -> None: """ processes all valid consecutive subdirectories. """ + logger.info("Found AHB root directory at: %s", input_dir.absolute()) logger.info("The output dir is %s", output_dir.absolute()) - consecutive_formatversions = determine_consecutive_formatversions() + + consecutive_formatversions = determine_consecutive_formatversions(input_dir) if not consecutive_formatversions: - logger.warning("⚠️no valid consecutive formatversion subdirectories found to compare") + logger.warning("❗️ No valid consecutive formatversion subdirectories found to compare.") return for subsequent_formatversion, previous_formatversion in consecutive_formatversions: @@ -479,10 +481,15 @@ def _process_submodule(output_dir: Path = DEFAULT_OUTPUT_DIR) -> None: "⌛processing consecutive formatversions: %s -> %s", subsequent_formatversion, previous_formatversion ) try: - _process_files(previous_formatversion, subsequent_formatversion, output_dir) + _process_files( + root_dir=input_dir, + previous_formatversion=previous_formatversion, + subsequent_formatversion=subsequent_formatversion, + output_dir=output_dir, + ) except (OSError, pd.errors.EmptyDataError, ValueError) as e: logger.error( - "❌error processing formatversions %s -> %s: %s", + "❌ Error processing formatversions %s -> %s: %s", subsequent_formatversion, previous_formatversion, str(e), diff --git a/unittests/test_export.py b/unittests/test_export.py index 7cf2e5c0..ab48335c 100644 --- a/unittests/test_export.py +++ b/unittests/test_export.py @@ -1,10 +1,14 @@ +import logging import os import tempfile from pathlib import Path import pandas as pd +import pytest from pandas.testing import assert_frame_equal +from typer.testing import CliRunner +from ahlbatross.cli import app from ahlbatross.excel import export_to_excel from ahlbatross.main import align_columns @@ -62,3 +66,29 @@ def test_empty_dataframe_export() -> None: assert os.path.exists(xlsx_path) assert os.path.getsize(xlsx_path) > 0 + + +def test_cli_with_custom_output_directory(tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None: + """ + test CLI handling of custom --output-dir. + """ + caplog.set_level(logging.INFO) + + input_dir = tmp_path / "input" + input_dir.mkdir() + fv_dir = input_dir / "FV2504" / "Nachrichtenformat_1" + fv_dir.mkdir(parents=True) + csv_dir = fv_dir / "csv" + csv_dir.mkdir() + (csv_dir / "test.csv").write_text("test data") + + output_dir = tmp_path / "custom_output" + output_dir.mkdir() + + runner = CliRunner() + result = runner.invoke( + app, ["--input-dir", str(input_dir), "--output-dir", str(output_dir)], catch_exceptions=False + ) + + assert result.exit_code == 0 + assert str(output_dir.absolute()) in caplog.text diff --git a/unittests/test_parse_submodule.py b/unittests/test_parse_ahb_data.py similarity index 63% rename from unittests/test_parse_submodule.py rename to unittests/test_parse_ahb_data.py index fef44821..698c5a1b 100644 --- a/unittests/test_parse_submodule.py +++ b/unittests/test_parse_ahb_data.py @@ -1,8 +1,10 @@ +import logging from pathlib import Path import pytest -from _pytest.monkeypatch import MonkeyPatch +from typer.testing import CliRunner +from ahlbatross.cli import app from ahlbatross.format_version_helpers import parse_formatversions from ahlbatross.main import determine_consecutive_formatversions, get_matching_pruefid_files @@ -34,11 +36,10 @@ def test_parse_invalid_formatversions() -> None: parse_formatversions(invalid_input) -def test_get_matching_files(tmp_path: Path, monkeypatch: MonkeyPatch) -> None: +def test_get_matching_files(tmp_path: Path) -> None: """ test find matching files across formatversions. """ - monkeypatch.setattr("ahlbatross.main.SUBMODULE", tmp_path) submodule: dict[str, dict[str, dict[str, str]]] = { "FV2504": { @@ -60,7 +61,9 @@ def test_get_matching_files(tmp_path: Path, monkeypatch: MonkeyPatch) -> None: for file, content in files.items(): (nachrichtenformat_dir / file).write_text(content) - matches = get_matching_pruefid_files("FV2410", "FV2504") + matches = get_matching_pruefid_files( + root_dir=tmp_path, previous_formatversion="FV2410", subsequent_formatversion="FV2504" + ) assert len(matches) == 2 assert matches[0][2] == "nachrichtenformat_1" @@ -68,12 +71,11 @@ def test_get_matching_files(tmp_path: Path, monkeypatch: MonkeyPatch) -> None: assert matches[1][3] == "pruefid_2" -def test_determine_consecutive_formatversions(tmp_path: Path, monkeypatch: MonkeyPatch) -> None: +def test_determine_consecutive_formatversions(tmp_path: Path) -> None: """ test successful determination of consecutive formatversions. """ - monkeypatch.setattr("ahlbatross.main.SUBMODULE", tmp_path) - + # Create test directory structure with formatversions and add dummy file submodule: dict[str, dict[str, bool | dict[str, str]]] = { "FV2504": {"nachrichtenformat_1": True}, "FV2410": {"nachrichtenformat_1": True}, @@ -92,5 +94,42 @@ def test_determine_consecutive_formatversions(tmp_path: Path, monkeypatch: Monke csv_dir.mkdir() (csv_dir / "test.csv").write_text("test") - result = determine_consecutive_formatversions() + result = determine_consecutive_formatversions(root_dir=tmp_path) assert result == [("FV2504", "FV2410")] + + +def test_cli_with_custom_input_directory(tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None: + """ + test CLI handling of custom --input-dir. + """ + caplog.set_level(logging.INFO) + + input_dir = tmp_path / "custom_input" + input_dir.mkdir() + fv_dir = input_dir / "FV2504" / "Nachrichtenformat_1" + fv_dir.mkdir(parents=True) + csv_dir = fv_dir / "csv" + csv_dir.mkdir() + (csv_dir / "test.csv").write_text("test data") + + runner = CliRunner() + result = runner.invoke(app, ["--input-dir", str(input_dir), "--output-dir", str(tmp_path)], catch_exceptions=False) + + assert result.exit_code == 0 + assert "No valid consecutive formatversion subdirectories found to compare." in caplog.text + + +def test_cli_with_invalid_input_directory(tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None: + """ + test CLI handling of invalid --input-dir. + """ + caplog.set_level(logging.INFO) + invalid_dir = tmp_path / "does_not_exist" + runner = CliRunner() + result = runner.invoke( + app, ["--input-dir", str(invalid_dir), "--output-dir", str(tmp_path)], catch_exceptions=False + ) + + assert "❌ Input directory does not exist:" in caplog.text + assert str(invalid_dir) in caplog.text + assert result.exit_code == 1