Added a flag to allow for level-wise ingestion of data. #304

Merged (24 commits, Dec 7, 2023). The file changes shown below reflect 16 of the 24 commits.

Commits:
- 63d9b94: Initial commit - COG naming as {filename}_level_{level_no} (j9sh264, Dec 7, 2022)
- e0b5161: changed the naming convention to {asset_name}_level_{level}_{height} (j9sh264, Dec 16, 2022)
- 71e94d6: Code cleanup, removed the repeated code blocks. (j9sh264, Mar 1, 2023)
- 0a9af55: Addressed internal team review comments. (j9sh264, Mar 17, 2023)
- fda5c96: Merge 'main' into mv-group-common-hypercubes (j9sh264, Mar 20, 2023)
- 8466b99: flake8 changes (j9sh264, Mar 20, 2023)
- 87e9965: New changes to maintain backward compatibility (j9sh264, May 17, 2023)
- b2942f7: BigQuery changes (j9sh264, May 17, 2023)
- acf5395: Merge branch 'main' into mv-group-common-hypercubes (j9sh264, May 18, 2023)
- 8c8da57: Linting changes (j9sh264, May 18, 2023)
- a8ab4e8: bigquery minor yield change (j9sh264, May 18, 2023)
- ebdb0e3: Added test cases (j9sh264, May 22, 2023)
- 8ad45a7: Addressed comments (j9sh264, May 22, 2023)
- 3b5763d: Filename correction (j9sh264, May 22, 2023)
- ef7a348: Merge branch 'main' into mv-group-common-hypercubes (j9sh264, Jun 5, 2023)
- 18e738d: Removed zarr argument check from bq.py (j9sh264, Jun 5, 2023)
- 0043409: Addressed Rahul's comments (j9sh264, Jun 6, 2023)
- 1a63361: Resolved comments (j9sh264, Jun 13, 2023)
- 8780a79: Version bump change (j9sh264, Jun 14, 2023)
- 437fc58: Merge branch 'main' into mv-group-common-hypercubes (j9sh264, Jul 4, 2023)
- 943cae0: Merge branch 'main' into mv-group-common-hypercubes (j9sh264, Oct 17, 2023)
- 6abe689: flake8 changes (j9sh264, Oct 17, 2023)
- a3e54d7: Minor change to solve sinks test cases. (j9sh264, Nov 23, 2023)
- 464a20d: Updated position of group_common_hypercubes function arg. (j9sh264, Nov 27, 2023)
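At a high level, the new --group_common_hypercubes flag groups a grib file's data variables by vertical level so that each level becomes its own dataset and, downstream, its own Earth Engine asset. A minimal, self-contained sketch of that grouping idea in plain xarray (illustration only; the variable names and level labels below are invented, not taken from the repo):

import numpy as np
import xarray as xr

# Toy data arrays tagged with the vertical level they belong to.
t_500 = xr.DataArray(np.zeros((2, 2)), dims=('lat', 'lon'), name='t',
                     attrs={'level': 'isobaricInhPa_500'})
r_500 = xr.DataArray(np.zeros((2, 2)), dims=('lat', 'lon'), name='r',
                     attrs={'level': 'isobaricInhPa_500'})
gh_sfc = xr.DataArray(np.zeros((2, 2)), dims=('lat', 'lon'), name='gh',
                      attrs={'level': 'surface'})

# Group by level, then merge each group into its own dataset, mirroring what
# __normalize_grib_dataset returns when group_common_hypercubes is set.
by_level = {}
for da in (t_500, r_500, gh_sfc):
    by_level.setdefault(da.attrs['level'], []).append(da)

per_level_datasets = [xr.merge(das) for das in by_level.values()]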
4 changes: 4 additions & 0 deletions weather_mv/loader_pipeline/bq.py
@@ -133,6 +133,10 @@ def validate_arguments(cls, known_args: argparse.Namespace, pipeline_args: t.Lis
if known_args.area:
assert len(known_args.area) == 4, 'Must specify exactly 4 lat/long values for area: N, W, S, E boundaries.'

# Add a check for group_common_hypercubes.
if pipeline_options_dict.get('group_common_hypercubes'):
raise RuntimeError('--group_common_hypercubes can be specified only for earth engine ingestions.')

# Check that all arguments are supplied for COG input.
_, uri_extension = os.path.splitext(known_args.uris)
if uri_extension == '.tif' and not known_args.tif_metadata_for_datetime:
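For context, a minimal standalone sketch (not part of the diff) of what this guard does; pipeline_options_dict below is a stand-in for the parsed pipeline options that validate_arguments already inspects:

# Hypothetical illustration of the bq-side guard.
pipeline_options_dict = {'group_common_hypercubes': True}  # assumed example input

if pipeline_options_dict.get('group_common_hypercubes'):
    # The BigQuery path does not support level-wise grouping, so it fails fast.
    raise RuntimeError('--group_common_hypercubes can be specified only for earth engine ingestions.')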
152 changes: 81 additions & 71 deletions weather_mv/loader_pipeline/ee.py
@@ -233,6 +233,7 @@ class ToEarthEngine(ToDataSink):
ee_qps: int
ee_latency: float
ee_max_concurrent: int
group_common_hypercubes: bool
band_names_mapping: str
initialization_time_regex: str
forecast_time_regex: str
@@ -268,6 +269,8 @@ def add_parser_arguments(cls, subparser: argparse.ArgumentParser):
help='The expected latency per requests, in seconds. Default: 0.5')
subparser.add_argument('--ee_max_concurrent', type=int, default=10,
help='Maximum concurrent api requests to EE allowed for your project. Default: 10')
subparser.add_argument('--group_common_hypercubes', action='store_true', default=False,
help='To group common hypercubes into image collections when loading grib data.')
subparser.add_argument('--band_names_mapping', type=str, default=None,
help='A JSON file which contains the band names for the TIFF file.')
subparser.add_argument('--initialization_time_regex', type=str, default=None,
@@ -417,6 +420,7 @@ class ConvertToAsset(beam.DoFn, beam.PTransform, KwargsFactoryMixin):
ee_asset_type: str = 'IMAGE'
open_dataset_kwargs: t.Optional[t.Dict] = None
disable_grib_schema_normalization: bool = False
group_common_hypercubes: t.Optional[bool] = False
band_names_dict: t.Optional[t.Dict] = None
initialization_time_regex: t.Optional[str] = None
forecast_time_regex: t.Optional[str] = None
@@ -436,80 +440,86 @@ def convert_to_asset(self, queue: Queue, uri: str):
with open_dataset(uri,
self.open_dataset_kwargs,
self.disable_grib_schema_normalization,
group_common_hypercubes=self.group_common_hypercubes,
band_names_dict=self.band_names_dict,
initialization_time_regex=self.initialization_time_regex,
forecast_time_regex=self.forecast_time_regex) as ds:

attrs = ds.attrs
data = list(ds.values())
asset_name = get_ee_safe_name(uri)
channel_names = [da.name for da in data]
start_time, end_time, is_normalized = (attrs.get(key) for key in
('start_time', 'end_time', 'is_normalized'))
dtype, crs, transform = (attrs.pop(key) for key in ['dtype', 'crs', 'transform'])
attrs.update({'is_normalized': str(is_normalized)}) # EE properties does not support bool.
# Make attrs EE ingestable.
attrs = make_attrs_ee_compatible(attrs)

# For tiff ingestions.
if self.ee_asset_type == 'IMAGE':
file_name = f'{asset_name}.tiff'

with MemoryFile() as memfile:
with memfile.open(driver='COG',
dtype=dtype,
width=data[0].data.shape[1],
height=data[0].data.shape[0],
count=len(data),
nodata=np.nan,
crs=crs,
transform=transform,
compress='lzw') as f:
for i, da in enumerate(data):
f.write(da, i+1)
# Making the channel name EE-safe before adding it as a band name.
f.set_band_description(i+1, get_ee_safe_name(channel_names[i]))
f.update_tags(i+1, band_name=channel_names[i])
f.update_tags(i+1, **da.attrs)
# Write attributes as tags in tiff.
f.update_tags(**attrs)

# Copy in-memory tiff to gcs.
forecast_time_regex=self.forecast_time_regex) as ds_list:
if not isinstance(ds_list, list):
ds_list = [ds_list]

for ds in ds_list:
attrs = ds.attrs
data = list(ds.values())
asset_name = get_ee_safe_name(uri)
channel_names = [da.name for da in data]
start_time, end_time, is_normalized = (attrs.get(key) for key in
('start_time', 'end_time', 'is_normalized'))
dtype, crs, transform = (attrs.pop(key) for key in ['dtype', 'crs', 'transform'])
attrs.update({'is_normalized': str(is_normalized)}) # EE properties does not support bool.
# Make attrs EE ingestable.
attrs = make_attrs_ee_compatible(attrs)

if self.group_common_hypercubes:
level, height = (attrs.pop(key) for key in ['level', 'height'])
safe_level_name = get_ee_safe_name(level)
asset_name = f'{asset_name}_{safe_level_name}'

# For tiff ingestions.
if self.ee_asset_type == 'IMAGE':
file_name = f'{asset_name}.tiff'

with MemoryFile() as memfile:
with memfile.open(driver='COG',
dtype=dtype,
width=data[0].data.shape[1],
height=data[0].data.shape[0],
count=len(data),
nodata=np.nan,
crs=crs,
transform=transform,
compress='lzw') as f:
for i, da in enumerate(data):
f.write(da, i+1)
# Making the channel name EE-safe before adding it as a band name.
f.set_band_description(i+1, get_ee_safe_name(channel_names[i]))
f.update_tags(i+1, band_name=channel_names[i])
f.update_tags(i+1, **da.attrs)

# Write attributes as tags in tiff.
f.update_tags(**attrs)

# Copy in-memory tiff to gcs.
target_path = os.path.join(self.asset_location, file_name)
with FileSystems().create(target_path) as dst:
shutil.copyfileobj(memfile, dst, WRITE_CHUNK_SIZE)
# For feature collection ingestions.
elif self.ee_asset_type == 'TABLE':
channel_names = []
file_name = f'{asset_name}.csv'

df = xr.Dataset.to_dataframe(ds)
df = df.reset_index()

# Copy in-memory dataframe to gcs.
target_path = os.path.join(self.asset_location, file_name)
with FileSystems().create(target_path) as dst:
shutil.copyfileobj(memfile, dst, WRITE_CHUNK_SIZE)
# For feature collection ingestions.
elif self.ee_asset_type == 'TABLE':
channel_names = []
file_name = f'{asset_name}.csv'

df = xr.Dataset.to_dataframe(ds)
df = df.reset_index()
# NULL and NaN create data-type mismatch issue in ee therefore replacing all of them.
# fillna fills in NaNs, NULLs, and NaTs but we have to exclude NaTs.
non_nat = df.select_dtypes(exclude=['datetime', 'timedelta', 'datetimetz'])
df[non_nat.columns] = non_nat.fillna(-9999)

# Copy in-memory dataframe to gcs.
target_path = os.path.join(self.asset_location, file_name)
with tempfile.NamedTemporaryFile() as tmp_df:
df.to_csv(tmp_df.name, index=False)
tmp_df.flush()
tmp_df.seek(0)
with FileSystems().create(target_path) as dst:
shutil.copyfileobj(tmp_df, dst, WRITE_CHUNK_SIZE)

asset_data = AssetData(
name=asset_name,
target_path=target_path,
channel_names=channel_names,
start_time=start_time,
end_time=end_time,
properties=attrs
)

self.add_to_queue(queue, asset_data)
self.add_to_queue(queue, None) # Indicates end of the subprocess.
with tempfile.NamedTemporaryFile() as tmp_df:
df.to_csv(tmp_df.name, index=False)
tmp_df.flush()
tmp_df.seek(0)
with FileSystems().create(target_path) as dst:
shutil.copyfileobj(tmp_df, dst, WRITE_CHUNK_SIZE)

asset_data = AssetData(
name=asset_name,
target_path=target_path,
channel_names=channel_names,
start_time=start_time,
end_time=end_time,
properties=attrs
)

self.add_to_queue(queue, asset_data)
self.add_to_queue(queue, None) # Indicates end of the subprocess.

def process(self, uri: str) -> t.Iterator[AssetData]:
"""Opens grib files and yields AssetData.
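To summarize the convert_to_asset change above: the opened result may now be a list of per-level datasets, and each dataset gets a level-specific asset name before it is written out. A condensed, self-contained sketch of that naming step (get_ee_safe_name here is a simplified stand-in for the helper in ee.py; the URI and level labels are hypothetical):

import re

def get_ee_safe_name(name):
    # Simplified stand-in: replace characters an EE asset name cannot contain.
    return re.sub(r'[^a-zA-Z0-9_-]+', '_', str(name))

uri = 'gs://bucket/forecast.grib2'        # hypothetical input file
levels = ['isobaricInhPa', 'surface']     # hypothetical 'level' attrs popped from each dataset

asset_names = [f'{get_ee_safe_name(uri)}_{get_ee_safe_name(level)}' for level in levels]
# With this stand-in: ['gs_bucket_forecast_grib2_isobaricInhPa', 'gs_bucket_forecast_grib2_surface']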
95 changes: 69 additions & 26 deletions weather_mv/loader_pipeline/sinks.py
@@ -220,9 +220,12 @@ def _is_3d_da(da):
return len(da.shape) == 3


def __normalize_grib_dataset(filename: str) -> xr.Dataset:
def __normalize_grib_dataset(filename: str,
group_common_hypercubes: t.Optional[bool] = False) -> t.Union[xr.Dataset,
t.List[xr.Dataset]]:
"""Reads a list of datasets and merge them into a single dataset."""
_data_array_list = []
_level_data_dict = {}

list_ds = cfgrib.open_datasets(filename)
ds_attrs = list_ds[0].attrs
dv_units_dict = {}
@@ -250,6 +253,13 @@ def __normalize_grib_dataset(filename: str) -> xr.Dataset:
attrs['start_time'] = start_time
attrs['end_time'] = end_time

if group_common_hypercubes:
attrs['level'] = level # Adding the level in the metadata, will remove in further steps.
attrs['is_normalized'] = True # Adding the 'is_normalized' attribute in the metadata.

if level not in _level_data_dict:
_level_data_dict[level] = []

no_of_levels = da.shape[0] if _is_3d_da(da) else 1

# Deal with the randomness that is 3d data interspersed with 2d.
@@ -271,7 +281,8 @@ def __normalize_grib_dataset(filename: str) -> xr.Dataset:
logger.debug('Found channel %s', channel_name)

# Add the height as a metadata field, that seems useful.
copied_da.attrs['height'] = height
copied_da.attrs['height'] = height_string

# Add the units of each band as a metadata field.
dv_units_dict['unit_'+channel_name] = None
if 'units' in attrs:
@@ -281,26 +292,44 @@ def __normalize_grib_dataset(filename: str) -> xr.Dataset:
if _is_3d_da(da):
copied_da = copied_da.sel({level: height})
copied_da = copied_da.drop_vars(level)
_data_array_list.append(copied_da)

# Stick the forecast hour, start_time, end_time, data variables units
# in the ds attrs as well, that's useful.
ds_attrs['forecast_hour'] = _data_array_list[0].attrs['forecast_hour']
ds_attrs['start_time'] = _data_array_list[0].attrs['start_time']
ds_attrs['end_time'] = _data_array_list[0].attrs['end_time']
ds_attrs.update(**dv_units_dict)
_level_data_dict[level].append(copied_da)

_data_array_list = []
for level, ds in _level_data_dict.items():
if len(ds) == 1:
dataset = ds[0].to_dataset(promote_attrs=True)
else:
dataset = xr.merge(ds)
_data_array_list.append(dataset)

if not group_common_hypercubes:
# Stick the forecast hour, start_time, end_time, data variables units
# in the ds attrs as well, that's useful.
ds_attrs['forecast_hour'] = _data_array_list[0].attrs['forecast_hour']
ds_attrs['start_time'] = _data_array_list[0].attrs['start_time']
ds_attrs['end_time'] = _data_array_list[0].attrs['end_time']
ds_attrs.update(**dv_units_dict)

merged_dataset = xr.merge(_data_array_list)
merged_dataset.attrs.clear()
merged_dataset.attrs.update(ds_attrs)
return merged_dataset

merged_dataset = xr.merge(_data_array_list)
merged_dataset.attrs.clear()
merged_dataset.attrs.update(ds_attrs)
return merged_dataset
return _data_array_list


def __open_dataset_file(filename: str,
uri_extension: str,
disable_grib_schema_normalization: bool,
open_dataset_kwargs: t.Optional[t.Dict] = None) -> xr.Dataset:
open_dataset_kwargs: t.Optional[t.Dict] = None,
group_common_hypercubes: t.Optional[bool] = False) -> t.Union[xr.Dataset, t.List[xr.Dataset]]:
"""Opens the dataset at 'uri' and returns a xarray.Dataset."""
# add a flag to group common hypercubes
if group_common_hypercubes:
return __normalize_grib_dataset(filename, group_common_hypercubes)

# add a flag to use cfgrib
if open_dataset_kwargs:
return _add_is_normalized_attr(xr.open_dataset(filename, **open_dataset_kwargs), False)

@@ -380,6 +409,7 @@ def open_dataset(uri: str,
open_dataset_kwargs: t.Optional[t.Dict] = None,
disable_grib_schema_normalization: bool = False,
tif_metadata_for_datetime: t.Optional[str] = None,
group_common_hypercubes: t.Optional[bool] = False,
band_names_dict: t.Optional[t.Dict] = None,
initialization_time_regex: t.Optional[str] = None,
forecast_time_regex: t.Optional[str] = None,
@@ -394,29 +424,42 @@ def open_dataset(uri: str,
return
with open_local(uri) as local_path:
_, uri_extension = os.path.splitext(uri)
xr_dataset: xr.Dataset = __open_dataset_file(local_path,
uri_extension,
disable_grib_schema_normalization,
open_dataset_kwargs)
if uri_extension in ['.tif', '.tiff']:
xr_dataset = _preprocess_tif(xr_dataset,
xr_datasets: xr.Dataset = __open_dataset_file(local_path,
uri_extension,
disable_grib_schema_normalization,
open_dataset_kwargs,
group_common_hypercubes)
# Extracting dtype, crs and transform from the dataset.
with rasterio.open(local_path, 'r') as f:
dtype, crs, transform = (f.profile.get(key) for key in ['dtype', 'crs', 'transform'])

if group_common_hypercubes:
total_size_in_bytes = 0

for xr_dataset in xr_datasets:
xr_dataset.attrs.update({'dtype': dtype, 'crs': crs, 'transform': transform})
total_size_in_bytes += xr_dataset.nbytes

logger.info(f'opened dataset size: {total_size_in_bytes}')
elif uri_extension in ['.tif', '.tiff']:
xr_dataset = _preprocess_tif(xr_datasets,
local_path,
tif_metadata_for_datetime,
uri,
band_names_dict,
initialization_time_regex,
forecast_time_regex)
else:
xr_dataset = xr_datasets

# Extracting dtype, crs and transform from the dataset & storing them as attributes.
with rasterio.open(local_path, 'r') as f:
dtype, crs, transform = (f.profile.get(key) for key in ['dtype', 'crs', 'transform'])
xr_dataset.attrs.update({'dtype': dtype, 'crs': crs, 'transform': transform})
xr_dataset.attrs.update({'dtype': dtype, 'crs': crs, 'transform': transform})

logger.info(f'opened dataset size: {xr_dataset.nbytes}')

beam.metrics.Metrics.counter('Success', 'ReadNetcdfData').inc()
yield xr_dataset
yield xr_datasets if group_common_hypercubes else xr_dataset
xr_dataset.close()

except Exception as e:
beam.metrics.Metrics.counter('Failure', 'ReadNetcdfData').inc()
logger.error(f'Unable to open file {uri!r}: {e}')
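A hypothetical caller-side sketch of the new open_dataset behavior (the import path and file path below are assumptions, not taken from the PR): with group_common_hypercubes=True, the context manager yields a list of per-level xarray Datasets instead of a single Dataset, which is what the new test in sinks_test.py asserts.

# Hypothetical usage sketch; adjust the import and file path to your checkout.
from weather_mv.loader_pipeline.sinks import open_dataset

with open_dataset('test_data_grib_multi_levels.grib2', group_common_hypercubes=True) as ds_list:
    for ds in ds_list:
        # Per the diff above, each element is one level's hypercube and its attrs
        # carry the 'level' it was grouped by.
        print(ds.attrs.get('level'), ds.nbytes)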
6 changes: 6 additions & 0 deletions weather_mv/loader_pipeline/sinks_test.py
@@ -86,6 +86,7 @@ def setUp(self) -> None:
self.test_grib_path = os.path.join(self.test_data_folder, 'test_data_grib_single_timestep')
self.test_tif_path = os.path.join(self.test_data_folder, 'test_data_tif_start_time.tif')
self.test_zarr_path = os.path.join(self.test_data_folder, 'test_data.zarr')
self.test_grib_multi_level_path = os.path.join(self.test_data_folder, 'test_data_grib_multi_levels.grib2')

def test_opens_grib_files(self):
with open_dataset(self.test_grib_path) as ds1:
@@ -118,6 +119,11 @@ def test_open_dataset__fits_memory_bounds(self):
with open_dataset(test_netcdf_path) as _:
pass

def test_group_common_hypercubes(self):
with open_dataset(self.test_grib_multi_level_path,
group_common_hypercubes=True) as ds:
self.assertEqual(isinstance(ds, list), True)


class DatetimeTest(unittest.TestCase):

Binary file not shown.