DAS-NONE: Move trajectory subsetter tests to shared_utils (#104)
flamingbear authored Oct 15, 2024
1 parent 45ba2f4 commit e23ee4b
Showing 14 changed files with 171 additions and 227 deletions.
1 change: 1 addition & 0 deletions .github/workflows/build-all-images.yml
@@ -43,6 +43,7 @@ jobs:
-
image: "trajectory-subsetter"
notebook: "TrajectorySubsetter_Regression.ipynb"
shared-utils: "true"
-
image: "variable-subsetter"
notebook: "VariableSubsetter_Regression.ipynb"
8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,14 @@ versioning. Rather than static releases, this repository contains a number
of regression tests that are each semi-independent. This CHANGELOG file should be used
to document pull requests to this repository.

## 2024-10-11 ([#104](https://github.com/nasa/harmony-regression-tests/pull/104))

- Migrates trajectory-subsetter to use `shared_utils`.
- Separates `shared_utils/utilities.py` into `utilities.py` and `compare.py`, preventing `xarray` from being a mandatory requirement to use `shared_utils`.
- Updates `shared_utils` `README` to mention the github action updates needed to use `shared_utils`.
- Removes the old `compare_results_to_reference_file` and renames `compare_results_to_reference_file_new` -> `compare_results_to_reference_file`.
- Migrates nsidc_icesat2 tests to the new `shared_utils` structure and names.

## 2024-10-11 ([#103](https://github.com/nasa/harmony-regression-tests/pull/103))

- Update the ATL03 and ATL08 reference files in the `nsidc-icesat2` regression
2 changes: 1 addition & 1 deletion README.md
@@ -222,7 +222,7 @@ channels:
- conda-forge
- defaults
dependencies:
- python=3.7
- python=3.11
- jupyter
- requests
- netcdf4
2 changes: 1 addition & 1 deletion test/Makefile
@@ -36,7 +36,7 @@ swath-projector-image: Dockerfile swath-projector/environment.yaml

trajectory-subsetter-image: Dockerfile trajectory-subsetter/environment.yaml
docker build -t ghcr.io/nasa/regression-tests-trajectory-subsetter:latest -f ./Dockerfile \
--build-arg notebook=TrajectorySubsetter_Regression.ipynb --build-arg sub_dir=trajectory-subsetter .
--build-arg notebook=TrajectorySubsetter_Regression.ipynb --build-arg sub_dir=trajectory-subsetter --build-arg shared_utils=true .

variable-subsetter-image: Dockerfile variable-subsetter/environment.yaml
docker build -t ghcr.io/nasa/regression-tests-variable-subsetter:latest -f ./Dockerfile \
15 changes: 6 additions & 9 deletions test/nsidc-icesat2/NSIDC-ICESAT2_Regression.ipynb
@@ -121,11 +121,8 @@
"import sys\n",
"\n",
"sys.path.append('../shared_utils')\n",
"from utilities import (\n",
" print_success,\n",
" submit_and_download,\n",
" compare_results_to_reference_file_new,\n",
")"
"from utilities import print_success, submit_and_download\n",
"from compare import compare_results_to_reference_file"
]
},
{
@@ -384,7 +381,7 @@
" assert exists(\n",
" test_output\n",
" ), 'Unsuccessful Harmony Request: {shortname}: {test_name}'\n",
" compare_results_to_reference_file_new(\n",
" compare_results_to_reference_file(\n",
" test_output, test_reference, identical=False\n",
" )\n",
" print_success(f'{shortname} {test_name} test request.')\n",
@@ -438,7 +435,7 @@
" assert exists(\n",
" test_output\n",
" ), 'Unsuccessful Harmony Request: {shortname}: {test_name}'\n",
" compare_results_to_reference_file_new(\n",
" compare_results_to_reference_file(\n",
" test_output,\n",
" test_reference,\n",
" identical=False,\n",
@@ -495,7 +492,7 @@
" assert exists(\n",
" test_output\n",
" ), 'Unsuccessful Harmony Request: {shortname}: {test_name}'\n",
" compare_results_to_reference_file_new(\n",
" compare_results_to_reference_file(\n",
" test_output, test_reference, identical=False, coordinates_to_fix=[]\n",
" )\n",
" print_success(f'{shortname} {test_name} test request.')\n",
@@ -524,7 +521,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.10"
"version": "3.11.5"
}
},
"nbformat": 4,
2 changes: 1 addition & 1 deletion test/nsidc-icesat2/version.txt
@@ -1 +1 @@
0.0.4
0.0.5
25 changes: 19 additions & 6 deletions test/shared_utils/README.md
@@ -1,5 +1,7 @@
## This directory contains common utility functions that can be shared across regression tests.

## Include the build arg on the Makefile for your tests

This directory can be included in your test suite by adding a build-arg to the docker build command in the Makefile.

```sh
@@ -8,11 +10,23 @@ nsidc-icesat2-image: Dockerfile nsidc-icesat2/environment.yaml
--build-arg notebook=NSIDC-ICESAT2_Regression.ipynb --build-arg sub_dir=nsidc-icesat2 --build-arg shared_utils=true .
```

Doing this will cause this directory and all its files to be included at `/workdir/shared_utils` in your container.
Doing this will cause this directory and all its files to be included at `/workdir/shared_utils` in your container when you are working locally.

## Update GitHub workflows to include the build arg for your tests

## Include the necessary python packages in your test's pip_requirements.txt
To include the `shared_utils` directory in the regression image built by GitHub, add a `shared-utils` key to the service matrix entry for your service in the `.github/workflows/build-all-images.yml` file, as was done for the Trajectory Subsetter:

The test environment is determined by the environment.yaml in the test directory, but if you are including `shared_utils` you will need to also include harmony-py and either xarray-datatree or a fancy pinned version of xarray
```yml
-
image: "trajectory-subsetter"
notebook: "TrajectorySubsetter_Regression.ipynb"
shared-utils: "true"

```

## Include the necessary python packages in your test's environment.yaml

The test environment is determined by the `environment.yaml` in the test directory, but if you are using routines from `shared_utils` you will also need to update your test's `environment.yaml` to include the libraries imported by the shared modules: `harmony-py` for routines from `utilities.py`, and a recent version of `xarray` for those from `compare.py`. As always, check the shared modules themselves for any new requirements.

For example, the pip requirements in the `nsidc-icesat2` environment file:
```
@@ -28,10 +42,9 @@ dependencies:
- pip
- pip:
- harmony-py==0.4.15
- git+https://github.com/pydata/xarray.git@ca2e9d6#egg=xarray
- xarray==2024.9.0
```


## Using the shared utility routines

To use routines from the `shared_utils` dir you need to add the `../shared_utils` directory to the Python module search path using `sys.path.append()` so that the modules will be found.
@@ -45,8 +58,8 @@ from utilities import (
print_error,
print_success,
submit_and_download,
compare_results_to_reference_file,
)
from compare import compare_results_to_reference_file

print_success('yay! you imported the functions.')
```
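As a standalone illustration of the `sys.path.append` pattern above, the following sketch builds a throwaway stand-in for `shared_utils` in a temporary directory (the stand-in module body is hypothetical, not the real `shared_utils` code):

```python
import sys
import tempfile
from pathlib import Path

# Create a stand-in "shared_utils" directory containing one module.
shared_dir = Path(tempfile.mkdtemp())
(shared_dir / 'utilities.py').write_text(
    "def print_success(message):\n"
    "    return f'Success: {message}'\n"
)

# Same pattern the notebooks use: extend the module search path, then import.
sys.path.append(str(shared_dir))
from utilities import print_success

print(print_success('you imported the functions'))  # Success: you imported the functions
```

In the real notebooks the appended path is `'../shared_utils'`, which resolves relative to the notebook's working directory inside the container.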
91 changes: 91 additions & 0 deletions test/shared_utils/compare.py
@@ -0,0 +1,91 @@
"""A module containing common functionality used by multiple regression
tests. These functions are kept out of the Jupyter notebook to increase the
readability of the regression test suite.
This module focuses on comparing output specifically with xarray.
"""

from itertools import count


from xarray.backends.api import open_groups
from xarray.core.datatree import DataTree
from xarray import Dataset


def compare_results_to_reference_file(
results_file_name: str,
reference_file_name: str,
identical: bool = True,
coordinates_to_fix: list[str] | None = None,
) -> None:
"""Use `DataTree` functionality to compare data values, variables,
coordinates, metadata, and all their corresponding attributes of
downloaded results to a reference file.
"""
if coordinates_to_fix is None:
coordinates_to_fix = []

reference_groups = open_groups(reference_file_name)
results_groups = open_groups(results_file_name)

# Fix unalignable coordinates
for coord in coordinates_to_fix:
reference_groups = unalign_groups(reference_groups, coord)
results_groups = unalign_groups(results_groups, coord)

reference_data = DataTree.from_dict(reference_groups)
results_data = DataTree.from_dict(results_groups)

if identical:
assert results_data.identical(
reference_data
), 'Output and reference files do not match.'
else:
assert results_data.equals(
reference_data
), 'Output and reference files do not match.'

reference_data = None
results_data = None


def unalign_groups(
dict_of_datasets: dict[str, Dataset], coordinate: str
) -> dict[str, Dataset]:
"""Rename coordinates with different dimensions across datasets.
This function addresses the issue of datasets having coordinates with the
same name but different dimensions, which causes problems when creating a
DataTree. Specifically for handling data products like ATL04 ICESat2, where
common coordinates (e.g., "delta_time") have different lengths across
datasets.
The function renames the specified coordinate in each dataset where it appears,
assigning a unique identifier to each instance. This allows for the creation of
a DataTree from the modified dictionary of datasets.
Parameters:
-----------
dict_of_datasets : dict[str, Dataset]
A dictionary of xarray Datasets, typically obtained from xarray.open_groups().
coordinate : str
The name of the coordinate to be renamed across Datasets.
Returns:
--------
dict[str, Dataset]
A new dictionary of datasets with the specified coordinate
incrementally renamed when present.
"""
counter = count(1)
return {
key: (
ds.rename({coordinate: f"{coordinate}_{next(counter)}"})
if coordinate in ds.coords
else ds
)
for key, ds in dict_of_datasets.items()
}
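The renaming trick in `unalign_groups` can be seen in isolation with plain dictionaries standing in for `xarray` `Dataset` objects (a simplified sketch so no `xarray` is required; the real function operates on the output of `xarray.open_groups` and renames only coordinates):

```python
from itertools import count


def unalign(groups: dict[str, dict], coordinate: str) -> dict[str, dict]:
    """Rename `coordinate` to coordinate_1, coordinate_2, ... wherever it appears."""
    counter = count(1)
    return {
        key: (
            {
                (f'{coordinate}_{next(counter)}' if name == coordinate else name): values
                for name, values in group.items()
            }
            if coordinate in group
            else group
        )
        for key, group in groups.items()
    }


# Toy groups mimicking an ICESat-2 file whose "delta_time" lengths differ.
groups = {
    '/gt1l': {'delta_time': [0, 1, 2], 'height': [10, 11, 12]},
    '/gt2l': {'delta_time': [0, 1], 'height': [20, 21]},
    '/ancillary': {'orbit': [1]},
}

renamed = unalign(groups, 'delta_time')
print(sorted(renamed['/gt1l']))       # ['delta_time_1', 'height']
print(sorted(renamed['/gt2l']))       # ['delta_time_2', 'height']
print(sorted(renamed['/ancillary']))  # ['orbit']
```

Because each occurrence gets a unique suffix, the groups no longer share a same-named coordinate with mismatched lengths, which is what allows `DataTree.from_dict` to succeed.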
109 changes: 1 addition & 108 deletions test/shared_utils/utilities.py
@@ -1,24 +1,14 @@
""" A module containing common functionality used by multiple regression tests
""" A module containing common functionality used by multiple
regression tests. These functions are kept out of the Jupyter notebook to
increase the readability of the regression test suite.
"""

from shutil import move
from itertools import count

from harmony import Client, Request
from harmony.harmony import ProcessingFailedException

try:
from xarray.backends.api import open_groups
from xarray.core.datatree import DataTree
from xarray import Dataset
except Exception:
# only used by Trajectory Subsetter tests.
# TODO: remove and make Trajectory Subsetter use above
from datatree import open_datatree


def print_error(error_string: str) -> str:
"""Print an error, with formatting for red text."""
@@ -58,100 +48,3 @@ def submit_and_download(
except ProcessingFailedException as exception:
print_error('Harmony request failed to complete successfully.')
raise exception


def compare_results_to_reference_file(
results_file_name: str, reference_file_name: str
) -> None:
"""Use `DataTree` functionality to compare data values, variables,
coordinates, metadata, and all their corresponding attributes of
downloaded results to a reference file.
"""
reference_data = open_datatree(reference_file_name)
results_data = open_datatree(results_file_name)

assert results_data.identical(reference_data), (
'Output and reference files ' 'do not match.'
)

reference_data = None
results_data = None


def compare_results_to_reference_file_new(
results_file_name: str,
reference_file_name: str,
identical: bool = True,
coordinates_to_fix: list[str] | None = None,
) -> None:
"""Use `DataTree` functionality to compare data values, variables,
coordinates, metadata, and all their corresponding attributes of
downloaded results to a reference file.
"""
if coordinates_to_fix is None:
coordinates_to_fix = []

reference_groups = open_groups(reference_file_name)
results_groups = open_groups(results_file_name)

# Fix unalignable coordinates
for coord in coordinates_to_fix:
reference_groups = unalign_groups(reference_groups, coord)
results_groups = unalign_groups(results_groups, coord)

reference_data = DataTree.from_dict(reference_groups)
results_data = DataTree.from_dict(results_groups)

if identical:
assert results_data.identical(
reference_data
), 'Output and reference files do not match.'
else:
assert results_data.equals(
reference_data
), 'Output and reference files do not match.'

reference_data = None
results_data = None


def unalign_groups(
dict_of_datasets: dict[str, Dataset], coordinate: str
) -> dict[str, Dataset]:
"""Rename coordinates with different dimensions across datasets.
This function addresses the issue of datasets having coordinates with the
same name but different dimensions, which causes problems when creating a
DataTree. Specifically for handling data products like ATL04 ICESat2, where
common coordinates (e.g., "delta_time") have different lengths across
datasets.
The function renames the specified coordinate in each dataset where it appears,
assigning a unique identifier to each instance. This allows for the creation of
a DataTree from the modified dictionary of datasets.
Parameters:
-----------
dict_of_datasets : dict[str, Dataset]
A dictionary of xarray Datasets, typically obtained from xarray.open_groups().
coordinate : str
The name of the coordinate to be renamed across Datasets.
Returns:
--------
dict[str, Dataset]
A new dictionary of datasets with the specified coordinate
incrementally renamed when present.
"""
counter = count(1)
return {
key: (
ds.rename({coordinate: f"{coordinate}_{next(counter)}"})
if coordinate in ds.coords
else ds
)
for key, ds in dict_of_datasets.items()
}
