Merge pull request #22 from scottstanie/orbit-step-2

Move orbit download to step 2
scottstanie authored Aug 25, 2023
2 parents f1b9245 + 0f810f1 commit bac0e11
Showing 7 changed files with 135 additions and 68 deletions.
37 changes: 25 additions & 12 deletions README.md
@@ -3,24 +3,28 @@

Workflow for creating unwrapped interferograms from Sentinel-1 geocoded SLCs.

## Install

+`sweets` is available to install via conda-forge:
+
+```bash
+mamba install -c conda-forge sweets
+```

-The following will install `sweets` into a conda environment.
+Alternatively, the following will install `sweets` into a conda environment.

1. Download source code:
```bash
git clone https://github.com/opera-adt/sweets.git && cd sweets
```
2. Install dependencies:
```bash
-conda env create --file conda-env.yml
+mamba env create --file conda-env.yml
```

or if you have an existing environment:
```bash
-conda env update --name my-existing-env --file conda-env.yml
+mamba env update --name my-existing-env --file conda-env.yml
```

3. Install `sweets` via pip:
@@ -29,27 +33,32 @@ conda activate sweets-env
python -m pip install .
```


## Usage

Installing creates a `sweets` command-line executable. You can run `sweets --help` to see the available options:
```bash
sweets --help
```
-You need to specify the bounding box (left, bottom, right, top) of the area of interest, the start dates (and end date, or it is assumed to be today), and the track number.
+To configure a workflow, the minimum inputs are
+- the bounding box of the area of interest in degrees longitude/latitude, as (left, bottom, right, top)
+- the start date (and an end date; otherwise today is assumed)
+- the track (relative orbit) number

For example:

```bash
-sweets config --bbox -102.3407 31.9909 -101.9407 32.3909 --start "2022-10-15" --track 78
-sweets run sweets_config.yaml
+sweets config --bbox -102.2 32.15 -102.1 32.22 --start 2022-12-15 --end 2022-12-29 --track 78
```
+This will make a YAML configuration file (by default `sweets_config.yaml`). You can inspect it to see all the configuration defaults.

-You can also print an empty config file to edit any parameters manually

+Then you can kick off the workflow using
```bash
-sweets config --print-empty
+sweets run sweets_config.yaml
```

-Or you can set all the parameters in python:
+### Configuration from Python
+
+Alternatively, you can configure everything in Python:
```python
from sweets.core import Workflow
bbox = (-102.3407, 31.9909, -101.9407, 32.3909)
@@ -70,7 +79,11 @@ w = Workflow.from_yaml("sweets_config.yml")
w.run()
```

+You can also print an empty config file to edit any parameters manually:
+
+```bash
+sweets config --print-empty
+```

## License

11 changes: 4 additions & 7 deletions conda-env.yml
@@ -5,16 +5,14 @@ dependencies:
  - python>=3.8
  - pip>=21.3 # https://pip.pypa.io/en/stable/reference/build-system/pyproject-toml/#editable-installation
  - git # for pip install, due to setuptools_scm
-  - aria2 # ASF Downloading
+  - wget # ASF Downloading
  # - isce3>=0.14.0
  # - compass>=0.4.1
  # - dask>=2022.6.0
  # - dolphin>=0.2.0
  # - gdal>=3.5
  # - geopandas-base>=0.12.0
  # - h5py>=3.6
  # - numpy>=1.20
  # - pandas>=1.5
  # - pydantic>=2.1
  # - requests>=2.20
  # - rich>=12.0
@@ -28,16 +26,15 @@ dependencies:
  - dask
  - dolphin>=0.3.0
  - gdal
  # - geopandas-base
  - h5py
  # - matplotlib-base
  - numpy
  # - pandas
  - pydantic>2.1
  - pyproj>=3.3
  - python-dateutil
  - rasterio
  - requests
  - rich
  - rioxarray
  - s1reader>=0.2.1
  - sardem
  - sentineleof
  - shapely
78 changes: 47 additions & 31 deletions docker/specfile.txt

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/sweets/_orbit.py
@@ -10,7 +10,7 @@

def download_orbits(search_path: Path, save_dir: Path) -> List[Path]:
    """Download orbit files for a given search path."""
-    logger.debug(f"search_path: {search_path}, save_dir: {save_dir}")
+    logger.info(f"Orbit search_path: {search_path}, save_dir: {save_dir}")
    filenames = download.main(
        search_path=search_path,
        save_dir=save_dir,
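For reference, a minimal usage sketch of this helper (the directory names are hypothetical; `download.main` is presumably the `sentineleof` downloader, which appears in the dependency list above):

```python
from pathlib import Path

from sweets._orbit import download_orbits

# Scan the SLC download directory and fetch the matching orbit files.
orbit_files = download_orbits(search_path=Path("data"), save_dir=Path("orbits"))
print(f"Got {len(orbit_files)} orbit files")
```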
29 changes: 14 additions & 15 deletions src/sweets/core.py
@@ -253,7 +253,7 @@ def _download_water_mask(self) -> Future:
            create_water_mask, self._water_mask_filename, self._dem_bbox
        )

-    def _download_rslcs(self) -> Future:
+    def _download_rslcs(self) -> list[Path]:
        """Download Sentinel zip files from ASF."""
        self.log_dir.mkdir(parents=True, exist_ok=True)
        # The final name will depend on if we're unzipping or not
@@ -264,18 +264,15 @@ def _download_rslcs(self) -> Future:
                f"Found {len(existing_files)} existing files in"
                f" {self.asf_query.out_dir}. Skipping download."
            )
-            return self._client.submit(lambda: existing_files)
+            return existing_files

        # If we didn't have any, we need to download them
        # TODO: how should we handle partial/failed downloads... do we really
        # want to re-search for them each time?
        # Maybe there can be a "force" flag to re-download everything?
        # or perhaps an API search, then if the number matches, we can skip
        # rather than let aria2c start and do the checksums
-        rslc_futures = self._client.submit(
-            self.asf_query.download, log_dir=self.log_dir
-        )
-        return rslc_futures
+        return self.asf_query.download(log_dir=self.log_dir)
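The return-type change makes `_download_rslcs` synchronous: callers get a plain `list[Path]` rather than a Dask future to `.result()`. A minimal sketch of the skip-if-present pattern it implements (the glob pattern and the callback are hypothetical stand-ins):

```python
from pathlib import Path
from typing import Callable

def get_or_download(
    out_dir: Path,
    do_download: Callable[[], list[Path]],
    pattern: str = "*.zip",
) -> list[Path]:
    # Files already on disk? Skip the (slow) remote query entirely.
    existing = sorted(out_dir.glob(pattern))
    if existing:
        return existing
    return do_download()
```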

    @log_runtime
    def _geocode_slcs(self, slc_files, dem_file, burst_db_file):
@@ -324,6 +321,12 @@ def cfg_to_filename(cfg_path: Path) -> str:

        # Get the first config file (by date) for each of the bursts.
        # We only need to create the static layers once per burst
+        def cfg_to_static_filename(cfg_path: Path) -> str:
+            # Convert the YAML config filename to a static-layers .h5 filename:
+            # geo_runconfig_20221029_t078_165578_iw3.yaml -> static_layers_t078_165578_iw3.h5
+            burst = "_".join(cfg_path.stem.split("_")[3:])
+            return f"static_layers_{burst}.h5"
+
        existing_static_paths = self._get_burst_static_layers()
        name_to_paths = {p.name: p for p in existing_static_paths}
        logger.info(f"Found {len(name_to_paths)} existing geometry files")
@@ -333,13 +336,12 @@ def cfg_to_filename(cfg_path: Path) -> str:
        static_files = []
        todo_static = []
        for cfg_file in day1_cfg_paths:
-            name = cfg_to_filename(cfg_file)
+            name = cfg_to_static_filename(cfg_file)
            if name in name_to_paths:
                static_files.append(name_to_paths[name])
            else:
                # Run the geocoding if we don't have it already
                todo_static.append(cfg_file)
-
        run_func = partial(run_static_layers, log_dir=self.log_dir)
        with ProcessPoolExecutor(max_workers=self.n_workers) as _client:
            new_files = _client.map(run_func, todo_static)
@@ -415,7 +417,7 @@ def _create_burst_interferograms(self, gslc_files):
        # Group the SLCs by burst:
        # {'t078_165573_iw2': [PosixPath('gslcs/t078_165573_iw2/20221029/...], 't078_...
        burst_to_gslc = group_by_burst(gslc_files)
-        burst_to_ifg = group_by_burst(self._get_existing_burst_ifgs())
+        burst_to_ifg = group_by_burst(self._get_existing_burst_ifgs(), minimum_images=1)
        ifg_path_list = []
        for burst, gslc_files in burst_to_gslc.items():
            subdatasets = [self._get_subdataset(f) for f in gslc_files]
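For intuition, a sketch of what a burst-grouping helper does (this is an illustration, not the real `group_by_burst`; the regex and the `minimum_images` handling are assumptions based on the burst IDs shown in the comment above):

```python
import re
from collections import defaultdict
from pathlib import Path

BURST_RE = re.compile(r"t\d{3}_\d{6}_iw[123]")  # e.g. t078_165573_iw2

def group_paths_by_burst(paths, minimum_images=2):
    groups = defaultdict(list)
    for p in map(Path, paths):
        m = BURST_RE.search(str(p))
        if m:
            groups[m.group()].append(p)
    # Keep only bursts with enough acquisitions to form pairs
    return {b: v for b, v in groups.items() if len(v) >= minimum_images}
```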
@@ -543,20 +545,17 @@ def run(self, starting_step: int = 1):
            dem_fut = self._download_dem()
            burst_db_fut = self._download_burst_db()
            water_mask_future = self._download_water_mask()
-            rslc_futures = self._download_rslcs()
-            orbits_future = self._client.submit(
-                download_orbits, self.asf_query.out_dir, self.orbit_dir
-            )
            # Gather the futures once everything is downloaded
            burst_db_file = burst_db_fut.result()
            dem_fut.result()
            wait([water_mask_future])
-            wait([orbits_future])
-            rslc_files = rslc_futures.result()
+            rslc_files = self._download_rslcs()

        # Second step:
        if starting_step <= 2:
+            burst_db_file = get_burst_db()
+            download_orbits(self.asf_query.out_dir, self.orbit_dir)
            rslc_files = self._get_existing_rslcs()
            self._geocode_slcs(rslc_files, self._dem_filename, burst_db_file)

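The effect of this restructuring: the burst DB and orbit files are fetched inside step 2, so a run started there no longer depends on step-1 futures. A sketch of resuming, assuming a config file already exists and the RSLCs are on disk:

```python
from sweets.core import Workflow

w = Workflow.from_yaml("sweets_config.yaml")
# Skips the ASF download; step 2 re-fetches the burst DB and orbits,
# then picks up the existing RSLCs for geocoding.
w.run(starting_step=2)
```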
19 changes: 17 additions & 2 deletions src/sweets/download.py
@@ -22,6 +22,7 @@
import subprocess
import sys
import zipfile
+from concurrent.futures import ThreadPoolExecutor
from datetime import date, datetime
from functools import lru_cache
from pathlib import Path
@@ -192,6 +193,20 @@ def _download_with_aria(self, urls, log_dir: Filename = Path(".")):
        with open(log_filename, "w") as f:
            subprocess.run(aria_cmd, shell=True, stdout=f, stderr=f, text=True)

+    def _download_with_wget(self, urls, log_dir: Filename = Path(".")):
+        def download_url(idx_url_pair):
+            idx, u = idx_url_pair
+            log_filename = Path(log_dir) / f"wget_{idx:02d}.log"
+            with open(log_filename, "w") as f:
+                wget_cmd = f'wget -nc -c "{u}" -P "{self.out_dir}"'
+                logger.info(f"({idx} / {len(urls)}): Downloading {u} with wget")
+                logger.info(wget_cmd)
+                subprocess.run(wget_cmd, shell=True, stdout=f, stderr=f, text=True)
+
+        # Parallelize the download using ThreadPoolExecutor
+        with ThreadPoolExecutor(max_workers=3) as executor:
+            list(executor.map(download_url, enumerate(urls)))

@log_runtime
def download(self, log_dir: Filename = Path(".")) -> list[Path]:
# Start by saving data available as geojson
@@ -206,8 +221,8 @@ def download(self, log_dir: Filename = Path(".")) -> list[Path]:
        self.out_dir.mkdir(parents=True, exist_ok=True)
        file_names = [self.out_dir / f for f in self._file_names(results)]

-        # NOTE: aria should skip already-downloaded files
-        self._download_with_aria(urls, log_dir=log_dir)
+        # TODO: use aria if available? or just make wget parallel...
+        self._download_with_wget(urls, log_dir=log_dir)

        if self.unzip:
            # Change to .SAFE extension
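A note on the new wget path: `-nc` (no-clobber) skips files already on disk and `-c` resumes partial downloads, so re-running is cheap. A hypothetical direct use of the downloader (the class name `ASFQuery` is an assumption; this diff only shows it as `self.asf_query`, built from the same fields used in the test below):

```python
from pathlib import Path

from sweets.download import ASFQuery  # class name assumed, not shown in this diff

q = ASFQuery(start="2022-12-15", end="2022-12-29", relativeOrbit=78, out_dir="data")
files = q.download(log_dir=Path("logs"))  # list[Path], per the signature above
```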
27 changes: 27 additions & 0 deletions tests/test_core.py
@@ -0,0 +1,27 @@
from sweets.core import Workflow


def test_workflow_construct(tmp_path):
    bbox = [-102.2, 32.15, -102.1, 32.22]
    # start, end, track = "2022-10-15", "2023-02-20", 78
    start, end, track = "2022-12-15", "2022-12-29", 78
    n_workers, tpw = 1, 16

    w = Workflow(
        asf_query=dict(
            start=start,
            end=end,
            relativeOrbit=track,
            out_dir="data",
        ),
        bbox=bbox,
        n_workers=n_workers,
        threads_per_worker=tpw,
        max_bandwidth=1,
        orbit_dir="orbits",
    )
    # print(w.run())  # Print out the final output results
    outfile = tmp_path / "config.yaml"
    w.to_yaml(outfile, with_comments=True)
    w2 = Workflow.from_yaml(outfile)
    assert w == w2