Feature/issue 235 remove dask dependency #236

Merged: 7 commits, Sep 6, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Changed
- Allow single netCDF file input in addition to single text file listings ([#230](https://github.com/nasa/stitchee/issues/230))([**@danielfromearth**](https://github.com/danielfromearth))
- Remove the dask dependency ([#235](https://github.com/nasa/stitchee/issues/235))([**@danielfromearth**](https://github.com/danielfromearth))


## [1.3.0] - 2024-07-11
7 changes: 2 additions & 5 deletions concatenator/attribute_handling.py
@@ -1,8 +1,5 @@
"""
attribute_handling.py

Functions for converting "coordinates" in netCDF variable attributes
between paths that reference a group hierarchy and flattened paths.
"""Functions for converting "coordinates" in netCDF variable attributes
between paths that reference a group hierarchy and flattened paths.
"""

import json
25 changes: 15 additions & 10 deletions concatenator/dataset_and_group_handling.py
@@ -1,12 +1,8 @@
"""
dataset_and_group_handling.py

Functions for converting multidimensional data structures
between a group hierarchy and a flat structure
"""
"""Functions to convert data structures between a group hierarchy and a flat structure."""

from __future__ import annotations

import logging
import re
from logging import Logger

@@ -20,6 +16,8 @@
regroup_coordinate_attribute,
)

module_logger = logging.getLogger(__name__)

# Match dimension names such as "__char28" or "__char16". Used for CERES datasets.
_string_dimension_name_pattern = re.compile(r"__char[0-9]+")
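
As a quick illustration of what that pattern accepts (a hedged aside, not part of the diff):

```python
import re

_string_dimension_name_pattern = re.compile(r"__char[0-9]+")

# Matches CERES-style string-length dimension names; rejects ordinary names.
assert _string_dimension_name_pattern.fullmatch("__char28") is not None
assert _string_dimension_name_pattern.fullmatch("latitude") is None
```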

@@ -51,7 +49,7 @@ def walk(
var_group_name = f"{group_path}{concatenator.group_delim}{var_name}"
new_dataset.variables[var_group_name] = var

# Flatten the paths of variables referenced in the coordinates attribute
# Flatten the paths of variables referenced in the 'coordinates' attribute
flatten_coordinate_attribute_paths(new_dataset, var, var_group_name)

if (len(var.dimensions) == 1) and _string_dimension_name_pattern.fullmatch(
@@ -99,6 +97,7 @@ def flatten_grouped_dataset(
----------
nc_dataset : nc.Dataset
netCDF4 Dataset that contains groups
ensure_all_dims_are_coords

Returns
-------
@@ -124,7 +123,7 @@
# Copy variables to root group with new name
nc_dataset.variables[new_var_name] = var

# Flatten the paths of variables referenced in the coordinates attribute.
# Flatten the paths of variables referenced in the 'coordinates' attribute.
flatten_coordinate_attribute_paths(nc_dataset, var, new_var_name)

del nc_dataset.variables[var_name] # Delete old variable
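
For orientation, a minimal sketch of what flattening does to a grouped variable name and its 'coordinates' attribute. The group name, variable names, and the "__" delimiter are illustrative assumptions, not values taken from this diff:

```python
group_delim = "__"                      # assumed value of concatenator.group_delim
group_path = "__RADIANCE"               # flattened path of the containing group
var_name = "solar_zenith"               # hypothetical variable name

# The variable is re-keyed under the root group with its full path baked in...
var_group_name = f"{group_path}{group_delim}{var_name}"
print(var_group_name)                   # __RADIANCE__solar_zenith

# ...and references in its 'coordinates' attribute are flattened the same way.
coords = "lat lon"
flat_coords = " ".join(f"{group_path}{group_delim}{c}" for c in coords.split())
print(flat_coords)                      # __RADIANCE__lat __RADIANCE__lon
```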
@@ -173,6 +172,7 @@ def regroup_flattened_dataset(
List of xarray datasets to be combined
output_file : str
Name of the output file to write the resulting NetCDF file to.
history_to_append : str
"""
with nc.Dataset(output_file, mode="w", format="NETCDF4") as base_dataset:
# Copy global attributes
@@ -325,7 +325,9 @@ def _get_dimension_size(dataset: nc.Dataset, dim_name: str) -> int:
return dim_size


def validate_workable_files(files: list[str], logger: Logger) -> tuple[list[str], int]:
def validate_workable_files(
files: list[str], logger: Logger | None = module_logger
) -> tuple[list[str], int]:
"""Remove files from a list that are not open-able as netCDF or that are empty."""
workable_files = []
for file in files:
@@ -335,7 +337,10 @@ def validate_workable_files(files: list[str], logger: Logger) -> tuple[list[str], int]:
if is_empty is False:
workable_files.append(file)
except OSError:
logger.debug("Error opening <%s> as a netCDF dataset. Skipping.", file)
if logger:
logger.debug("Error opening <%s> as a netCDF dataset. Skipping.", file)
else:
print(f"Error opening <{file}> as a netCDF dataset. Skipping.")

# addressing GitHub issue 153: propagate the first empty file if all input files are empty
if (len(workable_files)) == 0 and (len(files) > 0):
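
A minimal usage sketch of the updated `validate_workable_files` signature (the file names are hypothetical; omitting the logger now falls back to the module-level logger):

```python
import logging

from concatenator.dataset_and_group_handling import validate_workable_files

# As before this change: pass an explicit logger.
logger = logging.getLogger("stitchee.example")
workable, count = validate_workable_files(["granule_1.nc", "granule_2.nc"], logger)

# New: rely on the default module-level logger.
workable, count = validate_workable_files(["granule_1.nc", "granule_2.nc"])
```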
53 changes: 31 additions & 22 deletions concatenator/dimension_cleanup.py
@@ -1,8 +1,8 @@
"""
dimension_cleanup.py
"""Functions for renaming duplicated dimension names for netCDF variables.

Functions for renaming duplicated dimension names for netCDF variables, so that xarray can handle the dataset.
So that xarray can handle the dataset.
"""

import collections

import netCDF4 as nc
@@ -29,16 +29,20 @@ def remove_duplicate_dims(nc_dataset: nc.Dataset) -> nc.Dataset:
dup_vars[var_name] = var # populate dictionary with variables with vars with dup dims

for dup_var_name, dup_var in dup_vars.items():
dim_list = list(dup_var.dimensions) # original dimensions of the variable with duplicated dims
dim_list = list(
dup_var.dimensions
) # original dimensions of the variable with duplicated dims

# Dimension(s) that are duplicated are retrieved.
# Note: this is not yet tested for more than one duplicated dimension.
dim_dup = [item for item, count in collections.Counter(dim_list).items() if count > 1][0]
dim_dup_length = dup_var.shape[dup_var.dimensions.index(dim_dup)] # length of the duplicated dimension
dim_dup_length = dup_var.shape[
dup_var.dimensions.index(dim_dup)
] # length of the duplicated dimension

# New dimension and variable names are created.
dim_dup_new = dim_dup + '_1'
var_name_new = dup_var_name + '_1'
dim_dup_new = dim_dup + "_1"
var_name_new = dup_var_name + "_1"
dup_new_varnames.append(var_name_new)

# The last dimension for the variable is replaced with the new name in a temporary list.
@@ -48,9 +52,9 @@ def remove_duplicate_dims(nc_dataset: nc.Dataset) -> nc.Dataset:
new_dup_var = {}

# Attributes for the original variable are retrieved.
attrs_contents = get_attributes_minus_fillvalue_and_renamed_coords(original_var_name=dup_var_name,
new_var_name=dim_dup_new,
original_dataset=nc_dataset)
attrs_contents = get_attributes_minus_fillvalue_and_renamed_coords(
original_var_name=dup_var_name, new_var_name=dim_dup_new, original_dataset=nc_dataset
)
# for attrname in dup_var.ncattrs():
# if attrname != '_FillValue':
# contents: str = nc_dataset.variables[dup_var_name].getncattr(attrname)
@@ -71,11 +75,15 @@ def remove_duplicate_dims(nc_dataset: nc.Dataset) -> nc.Dataset:
if dim_dup in nc_dataset.variables.keys():

# New variable object is created for the renamed, previously duplicated dimension.
new_dup_var[dim_dup_new] = nc_dataset.createVariable(dim_dup_new, nc_dataset.variables[dim_dup].dtype,
(dim_dup_new,), fill_value=fill_value)
dim_var_attr_contents = get_attributes_minus_fillvalue_and_renamed_coords(original_var_name=dim_dup,
new_var_name=dim_dup_new,
original_dataset=nc_dataset)
new_dup_var[dim_dup_new] = nc_dataset.createVariable(
dim_dup_new,
nc_dataset.variables[dim_dup].dtype,
(dim_dup_new,),
fill_value=fill_value,
)
dim_var_attr_contents = get_attributes_minus_fillvalue_and_renamed_coords(
original_var_name=dim_dup, new_var_name=dim_dup_new, original_dataset=nc_dataset
)
for attr_name, contents in dim_var_attr_contents.items():
new_dup_var[dim_dup_new].setncattr(attr_name, contents)

@@ -85,25 +93,26 @@ def remove_duplicate_dims(nc_dataset: nc.Dataset) -> nc.Dataset:
del nc_dataset.variables[dup_var_name]

# Replace original *Variable* with new variable with no duplicated dimensions.
new_dup_var[dup_var_name] = nc_dataset.createVariable(dup_var_name, str(dup_var[:].dtype),
tuple(new_dim_list), fill_value=fill_value)
new_dup_var[dup_var_name] = nc_dataset.createVariable(
dup_var_name, str(dup_var[:].dtype), tuple(new_dim_list), fill_value=fill_value
)
for attr_name, contents in attrs_contents.items():
new_dup_var[dup_var_name].setncattr(attr_name, contents)
new_dup_var[dup_var_name][:] = dup_var[:]

return nc_dataset


def get_attributes_minus_fillvalue_and_renamed_coords(original_var_name: str,
new_var_name: str,
original_dataset: nc.Dataset) -> dict:
def get_attributes_minus_fillvalue_and_renamed_coords(
original_var_name: str, new_var_name: str, original_dataset: nc.Dataset
) -> dict:
"""Variable attributes are retrieved."""
attrs_contents = {}

for ncattr in original_dataset.variables[original_var_name].ncattrs():
if ncattr != '_FillValue':
if ncattr != "_FillValue":
contents: str = original_dataset.variables[original_var_name].getncattr(ncattr)
if ncattr == 'coordinates':
if ncattr == "coordinates":
contents = contents.replace(original_var_name, new_var_name)
attrs_contents[ncattr] = contents
