doc(misc): add doc strings
j-haacker committed Sep 10, 2024
1 parent 3c709fb commit 740684d
Showing 1 changed file with 167 additions and 6 deletions.
173 changes: 167 additions & 6 deletions cryoswath/misc.py
@@ -118,6 +118,13 @@ def cs_time_to_id(time: pd.Timestamp) -> str:


def convert_all_esri_to_feather(dir_path: str = None) -> None:
"""Converts ESRI/ArcGIS formatted files to feathers
Finds all .shp in given directory. Not recursive.
Args:
dir_path (str, optional): Root directory. Defaults to None.
"""
for shp_file in glob.glob("*.shp", root_dir=dir_path):
try:
gis.esri_to_feather(os.path.join(dir_path, shp_file))
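For illustration, a minimal usage sketch (the directory path is hypothetical, and `gis.esri_to_feather` is assumed to write a .feather file next to each shapefile):

from cryoswath import misc

# convert every shapefile in one directory (non-recursive);
# failures are caught per file by the try-block above
misc.convert_all_esri_to_feather("data/auxiliary/RGI")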
@@ -174,6 +181,15 @@ def convert_all_esri_to_feather(dir_path: str = None) -> None:


def extend_filename(file_name: str, extension: str) -> str:
"""Adds string at end of file name, before last "."
Args:
file_name (str): File name or path.
extension (str): String to insert at the end.
Returns:
str: The input name with `extension` inserted.
"""
fn_parts = file_name.split(os.path.extsep)
return os.path.extsep.join(fn_parts[:-1]) + extension + os.path.extsep + fn_parts[-1]
__all__.append("extend_filename")
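Because the implementation splits on the extension separator and rejoins everything before the last part, the inserted string always lands between the stem and the final suffix:

from cryoswath.misc import extend_filename

extend_filename("dem.vrt", "_filled")    # -> "dem_filled.vrt"
extend_filename("a.b.c.txt", "_v2")      # -> "a.b.c_v2.txt"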
@@ -185,6 +201,22 @@ def filter_kwargs(func: callable,
blacklist: list[str] = None,
whitelist: list[str] = None,
) -> dict:
"""Automatically reduces dict to accepted inputs
Detects expected key-word arguments of a function and only passes
those. Use black- and whitelists to refine.
Args:
func (callable): Target function.
kwargs (dict): KW-args to be filtered.
blacklist (list[str], optional): Arguments to exclude even if the
function accepts them. Defaults to None.
whitelist (list[str], optional): Extra arguments to include that are
not part of the function's signature. Defaults to None.
Returns:
dict: Filtered kw-args.
"""
def ensure_list(tmp_list):
if tmp_list is None: return []
elif isinstance(tmp_list, str): return [tmp_list]
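A usage sketch, assuming `filter_kwargs` inspects the target's signature and drops anything the function would not accept (`resample` and its options are hypothetical):

from cryoswath.misc import filter_kwargs

def resample(data, *, window=5, center=True):
    ...

options = {"window": 9, "center": False, "verbose": True}
# "verbose" is not in resample's signature and is dropped;
# blacklisting "center" removes it although it would be accepted
resample(None, **filter_kwargs(resample, options, blacklist=["center"]))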
@@ -197,6 +229,26 @@ def ensure_list(tmp_list):


def find_region_id(location: any, scope: str = "o2") -> str:
"""Returns RGI id for multitude of inputs
Special behavior in Greenland! If o2 region is requested, return id of
"custom" subregion: 05-11--05-15 for N, W, SW, SE, E. See geo-feathers
in `data/auxiliary/RGI/05-1*.feather`.
Args:
location (any): Can be a geo-referenced xarray.DataArray, a
geopandas.GeoDataFrame or Series, or a shapely.Geometry.
scope (str, optional): One of "o1", "o2", or "basin". Defaults to
"o2".
Raises:
Exception: `scope` is "o2" and `location` is in Greenland but
- not in one of the custom subregions or
- in more than one custom subregion.
Returns:
str: RGI id.
"""
if isinstance(location, xr.DataArray) or isinstance(location, xr.Dataset):
left, lower, right, upper = location.rio.transform_bounds(4326)
location = shapely.Point(left+(right-left)/2, lower+(upper-lower)/2)
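A usage sketch with a bare shapely geometry; per the code above, raster inputs are first collapsed to their center point in EPSG:4326 (the coordinates below are hypothetical):

import shapely
from cryoswath.misc import find_region_id

# a point on Greenland's east coast; with scope="o2" this should map to
# one of the custom subregions 05-11..05-15 described in the docstring
find_region_id(shapely.Point(-28.0, 72.0), scope="o2")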
@@ -239,7 +291,7 @@ def flag_outliers(data, *,
stat: callable = np.median,
deviation_factor: float = 3,
scaling_factor: float = 2*2**.5*scipy.special.erfinv(.5)):
"""Flags data that is considered outlier given a set of assumptions.
"""Flags data that is considered outlier given a set of assumptions
Data too far from a reference point is marked. Works analogously to
comparing data to its mean in terms of standard deviations.
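The default `scaling_factor`, 2·√2·erfinv(½) ≈ 1.349, is the ratio of the interquartile range to the standard deviation of a normal distribution. One plausible reading of the rule, sketched below since the function body is not shown in this hunk:

import numpy as np
import scipy.special

scale = 2 * 2**0.5 * scipy.special.erfinv(0.5)  # ~1.349

# flag points farther than deviation_factor scaled spreads from the median
data = np.array([1.0, 1.1, 0.9, 1.05, 8.0])
dev = np.abs(data - np.median(data))
flagged = dev > 3 * np.median(dev) * scale      # flags only the 8.0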
@@ -333,6 +385,21 @@ def gauss_filter_DataArray(da, dim, window_extent, std):


def get_dem_reader(data: any = None) -> rasterio.DatasetReader:
"""Determines which DEM to use
Attempts to determine the location of `data` and returns an
appropriate `rasterio.io.DatasetReader`. Only implemented for
ArcticDEM and REMA.
Args:
data (any): Input whose location can be inferred, e.g. a shapely
geometry or a latitude. Defaults to None.
Raises:
NotImplementedError: If region can't be inferred.
Returns:
rasterio.DatasetReader: Reader pointing to the file.
"""
if isinstance(data, shapely.Geometry):
lat = np.mean(data.bounds[1::2])
elif isinstance(data, float) \
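A usage sketch; the reader is a regular rasterio dataset, so it can be used as a context manager (the point is hypothetical, and a latitude of 72° should select ArcticDEM):

import shapely
from cryoswath.misc import get_dem_reader

with get_dem_reader(shapely.Point(-28.0, 72.0)) as dem:
    print(dem.crs, dem.res)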
@@ -363,15 +430,45 @@ def interpolate_hypsometrically(ds: xr.Dataset,
error: str,
elev: str = "ref_elev",
weights: str = "weights",
degree: int = 3,
outlier_replace: bool = False,
) -> xr.Dataset:
"""Fills data gaps by hypsometrical interpolation
If sufficient data is provided, this routine sorts and bins the data by
elevation bands and fits a third-order polynomial to the weighted
averages.
Sufficient data requires 4 or more bands, with an effective sample size
of 6 or larger, that span at least 2/3 of the total elevation range.
The weights used to calculate the weighted average are the reciprocal
squared errors if no weights are provided.
If dimension "time" exists, recurse into time steps and interpolate per
time step.
Args:
ds (xr.Dataset): Input with voids.
main_var (str): Name of variable to interpolate.
error (str): Name of the error variable. Only used if weights are
not provided.
elev (str, optional): Name of the variable that contains the reference
elevation used for binning. If the variable does not exist, an
attempt is made to read the reference elevations from disk.
Defaults to "ref_elev".
weights (str, optional): Name of the variable that contains the
weights. The weights are passed to `numpy.average` and should be
1/variance or similar. Defaults to "weights".
outlier_replace (bool, optional): If enabled, also interpolates
outliers. Defaults to False.
Returns:
xr.Dataset: Filled dataset.
"""
if "time" in ds.dims and len(ds.time) > 1:
# note: `groupby("time")` creates time dependencies for all data_vars. this
# requires taking note of those data_vars that do not depend on
# time and resetting those after the operation
no_time_dep = [data_var for data_var in ds.data_vars if not "time" in ds[data_var].dims]
ds = ds.groupby("time", squeeze=False).apply(interpolate_hypsometrically, main_var=main_var, elev=elev, error=error, degree=degree, outlier_replace=outlier_replace)
ds = ds.groupby("time", squeeze=False).apply(interpolate_hypsometrically, main_var=main_var, elev=elev, error=error, outlier_replace=outlier_replace)
ds[no_time_dep] = ds[no_time_dep].isel(time=0)
return ds
# assign weights if not present. use previously assigned weights to
@@ -380,7 +477,7 @@ def interpolate_hypsometrically(ds: xr.Dataset,
ds[weights] = 1/ds[error]**2
# abort if too little data (checking elevation and data validity).
# necessary to prevent errors but also introduces data gaps
if ds[elev].where(ds[error]>0).count() < degree*3:
if ds[elev].where(ds[error]>0).count() < 24:
# print("too little data")
return ds
# also, abort if there isn't anything to do
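A schematic of the fitting step the docstring describes (an illustration of the approach, not the function's actual internals):

import numpy as np

elev = np.linspace(200, 1800, 17)                 # elevation band centers
vals = 0.002 * elev - 1.5                         # band means (synthetic)
w = np.full(17, 4.0)                              # weights, e.g. 1/error**2
vals[5:8] = np.nan                                # voids to fill
ok = np.isfinite(vals)
# np.polyfit weights multiply residuals, hence sqrt of 1/variance weights
coeffs = np.polyfit(elev[ok], vals[ok], deg=3, w=np.sqrt(w[ok]))
vals[~ok] = np.polyval(coeffs, elev[~ok])         # evaluate fit at the voids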
@@ -723,7 +820,7 @@ def collect_missing_tracks(remote_files: list[str],


def load_o1region(o1code: str, product: str = "complexes") -> gpd.GeoDataFrame:
"""Loads RGI v7 basin or complex outlines and meta data.
"""Loads RGI v7 basin or complex outlines and meta data
Args:
o1code (str): Two-digit code, "01".."20".
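A typical call, assuming the RGI files are available locally:

from cryoswath.misc import load_o1region

complexes = load_o1region("05")                      # glacier complexes
glaciers = load_o1region("05", product="glaciers")   # individual glaciers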
@@ -777,6 +874,17 @@ def load_o1region(o1code: str, product: str = "complexes") -> gpd.GeoDataFrame:


def load_o2region(o2code: str, product: str = "complexes") -> gpd.GeoDataFrame:
"""Loads RGI v7 basin or complex outlines and meta data
Args:
o2code (str): RGI o2 code.
product (str, optional): Either "glaciers" or "complexes". Defaults
to "complexes".
Returns:
gpd.GeoDataFrame: Queried RGI data with geometry column containing
the outlines.
"""
o1region = load_o1region(o2code[:2], product)
# special handling for greenland periphery
if o2code.startswith("05") and not o2code.endswith("01"):
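A minimal call; Greenland codes other than "05-01" take the custom-subregion branch above:

from cryoswath.misc import load_o2region

rgi_o2 = load_o2region("05-11", product="glaciers")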
@@ -785,6 +893,15 @@ def load_o2region(o2code: str, product: str = "complexes") -> gpd.GeoDataFrame:


def load_basins(rgi_ids: list[str]) -> gpd.GeoDataFrame:
"""Loads RGI v7 basin or complex outlines and meta data
Args:
rgi_ids (list[str]): RGI basin ids.
Returns:
gpd.GeoDataFrame: Queried RGI data with geometry column containing
the outlines.
"""
if len(rgi_ids) > 1:
assert all(id[:17] == rgi_ids[0][:17] for id in rgi_ids)
rgi_o1_gpdf = load_o1region(rgi_ids[0].split("-")[3], product="glaciers")
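A sketch with hypothetical basin ids; all ids must share the same first-order region, which the assertion above checks via their first 17 characters:

from cryoswath.misc import load_basins

ids = [
    "RGI2000-v4.1-G-05-11025",  # hypothetical ids following the
    "RGI2000-v4.1-G-05-11026",  # RGI2000-v4.1-G-oo-nnnnn pattern
]
basins = load_basins(ids)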
@@ -797,6 +914,26 @@ def load_glacier_outlines(identifier: str|list[str],
union: bool = True,
crs: int|CRS = None,
) -> shapely.MultiPolygon:
"""Loads RGI v7 basin or complex outlines and meta data
Args:
identifier (str | list[str]): RGI id: either o1, o2, or
basin/complex id.
product (str, optional): Either "glaciers" or "complexes". Defaults
to "complexes".
union (bool, optional): Kept for backward compatibility. If enabled
(the default), only the union of all shapes is returned; if
disabled, the full GeoDataFrame. Defaults to True.
crs (int | CRS, optional): Convenience option to reproject shape(s)
to crs. Defaults to None.
Raises:
ValueError: If identifier was not understood.
Returns:
shapely.MultiPolygon: Union of basin shapes. If `union` is disabled,
a geopandas.GeoDataFrame including the full data is returned instead.
"""
if isinstance(identifier, list):
out = load_basins(identifier)
elif len(identifier) == (7+4+1+2+5+4) and identifier.split("-")[:3] == ["RGI2000", "v4.1", "G"]:
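A sketch of the accepted identifier forms (the o2 code and EPSG code are examples):

from cryoswath.misc import load_glacier_outlines

shape = load_glacier_outlines("05-11")             # union as MultiPolygon
gdf = load_glacier_outlines("05-11", union=False)  # full GeoDataFrame
utm = load_glacier_outlines("05-11", crs=32624)    # reprojected to UTM 24N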
@@ -955,7 +1092,20 @@ def move_node(name, node):
__all__.append("repair_l2_cache")


def rgi_code_translator(input: str, out_type: str = "full_name") -> str:
def rgi_code_translator(input: str|list[str], out_type: str = "full_name") -> str:
"""Translate o1 or o2 codes to region names
Args:
input (str | list[str]): RGI o1 or o2 code(s).
out_type (str, optional): Either "full_name" or "long_code".
Defaults to "full_name".
Raises:
ValueError: If input is not understood.
Returns:
str: Either full name or RGI "long_code".
"""
if isinstance(input, list):
return [rgi_code_translator(element, out_type) for element in input]
if isinstance(input, int) or len(input) <= 2 and int(input) < 20:
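A sketch; RGI first-order region 05 is the Greenland Periphery, and lists are translated element-wise:

from cryoswath.misc import rgi_code_translator

rgi_code_translator("05")                      # -> "Greenland Periphery"
rgi_code_translator(["01", "05"], out_type="full_name")  # list in, list out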
Expand Down Expand Up @@ -1044,6 +1194,17 @@ def weighted_mean_excl_outliers(df: pd.DataFrame|xr.Dataset = None,

def xycut(data: gpd.GeoDataFrame, x_chunk_meter = 3*4*5*1_000, y_chunk_meter = 3*4*5*1_000)\
-> list[dict[str, Union[float, gpd.GeoDataFrame]]]:
"""Chunk point data in planar reference system
This mainly is a helper function for `l3.build_dataset()` that takes
many data points and chunks them based on their location.
However, it may be helpful in other contexts.
Returns:
list: List of dicts of which each contains the x and y extents of
the current chunk and the GeoDataFrame or Series of the point
data.
"""
# 3*4*5=60 [km] fits many grid cell sizes and makes reasonable chunks
# ! only implemented for l2 data. however, easily convertible for l1b and l3 data
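A usage sketch with a small projected point dataset (EPSG:3413 is an example of a meter-based CRS):

import geopandas as gpd
from shapely.geometry import Point
from cryoswath.misc import xycut

pts = gpd.GeoDataFrame(
    {"h": [1.0, 2.0, 3.0]},
    geometry=[Point(10_000, 20_000), Point(70_000, 20_000), Point(10_000, 80_000)],
    crs=3413,
)
for chunk in xycut(pts):   # default tiles: 60 km x 60 km
    print(chunk.keys())    # x/y extents plus the chunk's point data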
