Restructure library into record and batch

earthobservations · Dec 28, 2023 · f7aec1d · f7aec1d
1 parent 2f30856
commit f7aec1d
Show file tree

Hide file tree

Showing 23 changed files with 451 additions and 435 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [Unreleased]
+
+### Added
+
+- Restructure library into record and batch
+
 ## [0.2.1] - 2023-12-27
 
 ### Fixed

diff --git a/README.md b/README.md
@@ -19,11 +19,21 @@ There is a simple command line interface.
 The `isd record` command prints a single record in JSON format:
 
 ```shell
-isd record 720538-00164-2021
+isd record tests/data/720538-00164-2021
 ```
 
 The Python API allows reading compressed and uncompressed ISD files:
 
+```python
+from isd import Batch
+
+batch = Batch.from_path("isd-file")
+for record in batch:
+    print(record)
+```
+
+Streaming is also supported:
+
 ```python
 import isd.io
 

diff --git a/docs/api/batch.rst b/docs/api/batch.rst
@@ -0,0 +1,5 @@
+batch
+=====
+
+.. automodule:: isd.batch
+    :members:
diff --git a/docs/api/errors.rst b/docs/api/errors.rst
@@ -1,5 +1,5 @@
-isd.errors
-==========
+errors
+======
 
 .. automodule:: isd.errors
     :members:
diff --git a/docs/api/io.rst b/docs/api/io.rst
@@ -1,5 +1,5 @@
-isd.io
-======
+io
+==
 
 .. automodule:: isd.io
     :members:
diff --git a/docs/api/mod.rst b/docs/api/mod.rst
@@ -9,8 +9,8 @@ Most useful functions and classes are contained in submodules.
 
    errors
    io
-   pandas
    record
+   batch
 
 isd
 ---

diff --git a/docs/api/pandas.rst b/docs/api/pandas.rst
diff --git a/docs/api/record.rst b/docs/api/record.rst
@@ -1,5 +1,5 @@
-isd.record
-==========
+record
+======
 
 .. automodule:: isd.record
     :members:
diff --git a/examples/check_timesteps.py b/examples/check_timesteps.py
@@ -1,30 +1,44 @@
 """Given a directory of ISD files, checks that all timesteps monotonically increase."""
-
-import os
+import logging
+from pathlib import Path
 import sys
+from typing import Union
 
 import tqdm
 
-import isd.io
-
-directory = sys.argv[1]
-paths = [os.path.join(directory, file_name) for file_name in os.listdir(directory)]
-all_monotonic = True
-bad_paths = []
-for path in tqdm.tqdm(paths):
-    data_frame = isd.io.read_to_data_frame(path)
-    min = data_frame.timestamp.min()
-    max = data_frame.timestamp.max()
-    is_monotonic = data_frame.timestamp.is_monotonic
-    if not is_monotonic:
-        all_monotonic = False
-        bad_paths.append(path)
-    tqdm.tqdm.write(f"{path}: min={min}, max={max}, is_monotonic={is_monotonic}")
-
-if all_monotonic:
-    print("All files have monotonically increasing timestamps!")
-else:
-    print("Not all files have monotonically increasing timestamps, here they are:")
-    for path in bad_paths:
-        print(f"    - {path}")
-    sys.exit(1)
+from isd import Batch
+
+logging.basicConfig(level=logging.INFO)
+
+log = logging.getLogger(__name__)
+
+
+def main(path: Union[str, Path]) -> None:
+    path = Path(path)
+    file_names = list(path.glob("*"))
+    all_monotonic = True
+    bad_files = []
+    for file_name in tqdm.tqdm(file_names):
+        df = Batch.from_path(file_name).to_df()
+        ts_min = df.datetime.min()
+        ts_max = df.datetime.max()
+        is_monotonic = df.datetime.is_monotonic_increasing
+        if not is_monotonic:
+            all_monotonic = False
+            bad_files.append(file_name)
+        log.info(
+            f"{file_name}: min={ts_min}, max={ts_max}, is_monotonic={is_monotonic}"
+        )
+
+    if all_monotonic:
+        print("All files have monotonically increasing timestamps!")
+    else:
+        print("Not all files have monotonically increasing timestamps, here they are:")
+        for file_name in bad_files:
+            print(f"    - {file_name}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    directory = sys.argv[1]
+    main(directory)
diff --git a/src/isd/__init__.py b/src/isd/__init__.py
@@ -1,4 +1,5 @@
 from isd.errors import IsdError
+from isd.batch import Batch
 from isd.record import Record
 
-__all__ = ["IsdError", "Record"]
+__all__ = ["IsdError", "Batch", "Record"]
diff --git a/src/isd/batch.py b/src/isd/batch.py
@@ -0,0 +1,90 @@
+import gzip
+import json
+from io import BytesIO
+from pathlib import Path
+from dataclasses import dataclass
+from typing import List, Union, Optional, Dict, Any, Iterator
+import datetime as dt
+
+from isd.record import Record
+
+import pandas as pd
+
+
+@dataclass
+class Batch:
+    records: List[Record]
+
+    def __len__(self) -> int:
+        return len(self.records)
+
+    def __getitem__(self, index: int) -> Record:
+        return self.records[index]
+
+    def __iter__(self) -> Iterator[Record]:
+        return iter(self.records)
+
+    @classmethod
+    def from_path(cls, path: Union[str, Path]) -> "Batch":
+        """Opens a local ISD file and returns an iterator over its records.
+
+        If the path has a .gz extension, this function will assume it has gzip
+        compression and will attempt to open it using `gzip.open`.
+        """
+        path = Path(path)
+        if path.suffix == ".gz":
+            with gzip.open(path) as gzip_file:
+                return cls(
+                    [
+                        Record.from_string(gzip_line.decode("utf-8"))
+                        for gzip_line in gzip_file
+                    ]
+                )
+        else:
+            with open(path) as uncompressed_file:
+                return cls(
+                    [
+                        Record.from_string(uncompressed_line)
+                        for uncompressed_line in uncompressed_file
+                    ]
+                )
+
+    @classmethod
+    def from_string(cls, string: Union[str, BytesIO]) -> "Batch":
+        """Reads records from a text io stream."""
+        if isinstance(string, BytesIO):
+            string = string.read().decode("utf-8")
+        return cls([Record.from_string(line) for line in string.splitlines()])
+
+    def filter_by_datetime(
+        self,
+        start_date: Optional[dt.datetime] = None,
+        end_date: Optional[dt.datetime] = None,
+    ) -> "Batch":
+        """Returns an iterator over records filtered by start and end datetimes (both optional)."""
+        return Batch(
+            [
+                record
+                for record in self.records
+                if (not start_date or record.datetime >= start_date)
+                and (not end_date or record.datetime < end_date)
+            ]
+        )
+
+    def to_dict(self) -> List[Dict[str, Any]]:
+        """Returns a list of dictionaries, one for each record."""
+        return [record.to_dict() for record in self.records]
+
+    def to_json(self, indent: int = 4) -> str:
+        """Returns a JSON string of all records."""
+        data = []
+        for d in self.to_dict():
+            d["datetime"] = d["datetime"].isoformat()
+            data.append(d)
+        return json.dumps(data, indent=indent)
+
+    def to_df(self) -> pd.DataFrame:
+        """Reads a local ISD file into a DataFrame."""
+        import pandas as pd
+
+        return pd.DataFrame([record.to_dict() for record in self.records])
diff --git a/src/isd/cli.py b/src/isd/cli.py
@@ -1,13 +1,9 @@
 # type: ignore
 
-import dataclasses
-import itertools
-import json
-
 import click
 from click import ClickException
 
-import isd.io
+from isd.batch import Batch
 
 
 @click.group()
@@ -20,9 +16,9 @@ def main() -> None:
 @click.option("-i", "--index", default=0)
 def record(infile: str, index: int) -> None:
     """Prints a single record to standard output in JSON format."""
-    with isd.io.open(infile) as records:
-        record = next(itertools.islice(records, index, None), None)
-        if record:
-            print(json.dumps(dataclasses.asdict(record), indent=4))
-        else:
-            raise ClickException(f"No record with index {index}")
+    batch = Batch.from_path(infile)
+    try:
+        record_ = batch[index]
+        print(record_.to_json())
+    except IndexError:
+        raise ClickException(f"No record with index {index}")
diff --git a/src/isd/io.py b/src/isd/io.py
@@ -1,12 +1,9 @@
-import datetime
 import gzip
 import os.path
 from contextlib import contextmanager
-from typing import Generator, Iterable, Iterator, Optional, TextIO
+from typing import Generator, Iterable
 
-from pandas import DataFrame
 
-from . import pandas as isd_pandas
 from .record import Record
 
 builtin_open = open
@@ -21,28 +18,12 @@ def open(path: str) -> Generator[Iterable[Record], None, None]:
     """
     if os.path.splitext(path)[1] == ".gz":
         with gzip.open(path) as gzip_file:
-            yield (Record.parse(gzip_line.decode("utf-8")) for gzip_line in gzip_file)
+            yield (
+                Record.from_string(gzip_line.decode("utf-8")) for gzip_line in gzip_file
+            )
     else:
         with builtin_open(path) as uncompressed_file:
             yield (
-                Record.parse(uncompressed_line)
+                Record.from_string(uncompressed_line)
                 for uncompressed_line in uncompressed_file
             )
-
-
-def from_text_io(text_io: TextIO) -> Iterator[Record]:
-    """Reads records from a text io stream."""
-    while True:
-        line = text_io.readline()
-        if not line:
-            break
-        else:
-            yield Record.parse(line)
-
-
-def read_to_data_frame(
-    path: str, since: Optional[datetime.datetime] = None
-) -> DataFrame:
-    """Reads a local ISD file into a DataFrame."""
-    with open(path) as file:
-        return isd_pandas.data_frame(file, since=since)