Restructure library into record and batch

earthobservations · Jan 17, 2024 · 7f17f51 · 7f17f51
1 parent 2f30856
commit 7f17f51
Show file tree

Hide file tree

Showing 19 changed files with 344 additions and 351 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [Unreleased]
+
+### Added
+
+- Restructure library into record and batch ([#36](https://github.com/gadomski/pyisd/pull/36))
+
 ## [0.2.1] - 2023-12-27
 
 ### Fixed

diff --git a/README.md b/README.md
@@ -19,11 +19,21 @@ There is a simple command line interface.
 The `isd record` command prints a single record in JSON format:
 
 ```shell
-isd record 720538-00164-2021
+isd record tests/data/720538-00164-2021
 ```
 
 The Python API allows reading compressed and uncompressed ISD files:
 
+```python
+from isd import Batch
+
+batch = Batch.from_path("isd-file")
+for record in batch:
+    print(record)
+```
+
+Streaming is also supported:
+
 ```python
 import isd.io
 

diff --git a/docs/api/mod.rst b/docs/api/mod.rst
@@ -9,8 +9,8 @@ Most useful functions and classes are contained in submodules.
 
    errors
    io
-   pandas
    record
+   batch
 
 isd
 ---

diff --git a/docs/api/pandas.rst b/docs/api/pandas.rst
diff --git a/examples/check_timesteps.py b/examples/check_timesteps.py
@@ -5,17 +5,17 @@
 
 import tqdm
 
-import isd.io
+from isd import Batch
 
 directory = sys.argv[1]
 paths = [os.path.join(directory, file_name) for file_name in os.listdir(directory)]
 all_monotonic = True
 bad_paths = []
 for path in tqdm.tqdm(paths):
-    data_frame = isd.io.read_to_data_frame(path)
-    min = data_frame.timestamp.min()
-    max = data_frame.timestamp.max()
-    is_monotonic = data_frame.timestamp.is_monotonic
+    data_frame = Batch.from_path(path).to_data_frame()
+    min = data_frame.datetime.min()
+    max = data_frame.datetime.max()
+    is_monotonic = data_frame.datetime.is_monotonic
     if not is_monotonic:
         all_monotonic = False
         bad_paths.append(path)

diff --git a/src/isd/__init__.py b/src/isd/__init__.py
@@ -1,4 +1,5 @@
 from isd.errors import IsdError
+from isd.batch import Batch
 from isd.record import Record
 
-__all__ = ["IsdError", "Record"]
+__all__ = ["IsdError", "Batch", "Record"]
diff --git a/src/isd/batch.py b/src/isd/batch.py
@@ -0,0 +1,86 @@
+from __future__ import annotations
+import gzip
+import json
+from io import BytesIO
+from pathlib import Path
+from dataclasses import dataclass
+from typing import List, Union, Optional, Dict, Any, Iterator
+import datetime
+
+from isd.record import Record
+
+import pandas
+
+
+@dataclass
+class Batch:
+    records: List[Record]
+
+    def __len__(self) -> int:
+        return len(self.records)
+
+    def __getitem__(self, index: int) -> Record:
+        return self.records[index]
+
+    def __iter__(self) -> Iterator[Record]:
+        return iter(self.records)
+
+    @classmethod
+    def parse(cls, lines: Union[str, BytesIO]) -> Batch:
+        """Parses records from a string or a BytesIO stream (decoded as UTF-8), one record per line."""
+        if isinstance(lines, BytesIO):
+            lines = lines.read().decode("utf-8")
+        return cls([Record.parse(line) for line in lines.splitlines()])
+
+    @classmethod
+    def from_path(cls, path: Union[str, Path]) -> Batch:
+        """Reads a local ISD file and returns a Batch holding all of its records.
+
+        If the path has a .gz extension, this function will assume it has gzip
+        compression and will attempt to open it using `gzip.open`.
+        """
+        path = Path(path)
+        if path.suffix == ".gz":
+            with gzip.open(path) as gzip_file:
+                return cls(
+                    [Record.parse(gzip_line.decode("utf-8")) for gzip_line in gzip_file]
+                )
+        else:
+            with open(path) as uncompressed_file:
+                return cls(
+                    [
+                        Record.parse(uncompressed_line)
+                        for uncompressed_line in uncompressed_file
+                    ]
+                )
+
+    def filter_by_datetime(
+        self,
+        start_date: Optional[datetime.datetime] = None,
+        end_date: Optional[datetime.datetime] = None,
+    ) -> Batch:
+        """Returns a new Batch with the records whose datetime is at or after start_date and before end_date (both optional)."""
+        return Batch(
+            [
+                record
+                for record in self.records
+                if (not start_date or record.datetime() >= start_date)
+                and (not end_date or record.datetime() < end_date)
+            ]
+        )
+
+    def to_dict(self) -> List[Dict[str, Any]]:
+        """Returns a list of dictionaries, one for each record."""
+        return [record.to_dict() for record in self.records]
+
+    def to_json(self, indent: int = 4) -> str:
+        """Returns a JSON string containing an array with one object per record."""
+        data = []
+        for d in self.to_dict():
+            d["timestamp"] = d["timestamp"].isoformat()
+            data.append(d)
+        return json.dumps(data, indent=indent)
+
+    def to_data_frame(self) -> pandas.DataFrame:
+        """Converts the records in this batch into a DataFrame, one row per record."""
+        return pandas.DataFrame([record.to_dict() for record in self.records])
diff --git a/src/isd/cli.py b/src/isd/cli.py
@@ -1,13 +1,9 @@
 # type: ignore
 
-import dataclasses
-import itertools
-import json
-
 import click
 from click import ClickException
 
-import isd.io
+from isd.batch import Batch
 
 
 @click.group()
@@ -20,9 +16,9 @@ def main() -> None:
 @click.option("-i", "--index", default=0)
 def record(infile: str, index: int) -> None:
     """Prints a single record to standard output in JSON format."""
-    with isd.io.open(infile) as records:
-        record = next(itertools.islice(records, index, None), None)
-        if record:
-            print(json.dumps(dataclasses.asdict(record), indent=4))
-        else:
-            raise ClickException(f"No record with index {index}")
+    batch = Batch.from_path(infile)
+    try:
+        record_ = batch[index]
+        print(record_.to_json())
+    except IndexError as e:
+        raise ClickException(f"No record with index {index}") from e
diff --git a/src/isd/io.py b/src/isd/io.py
@@ -1,12 +1,9 @@
-import datetime
 import gzip
 import os.path
 from contextlib import contextmanager
-from typing import Generator, Iterable, Iterator, Optional, TextIO
+from typing import Generator, Iterable
 
-from pandas import DataFrame
 
-from . import pandas as isd_pandas
 from .record import Record
 
 builtin_open = open
@@ -28,21 +25,3 @@ def open(path: str) -> Generator[Iterable[Record], None, None]:
                 Record.parse(uncompressed_line)
                 for uncompressed_line in uncompressed_file
             )
-
-
-def from_text_io(text_io: TextIO) -> Iterator[Record]:
-    """Reads records from a text io stream."""
-    while True:
-        line = text_io.readline()
-        if not line:
-            break
-        else:
-            yield Record.parse(line)
-
-
-def read_to_data_frame(
-    path: str, since: Optional[datetime.datetime] = None
-) -> DataFrame:
-    """Reads a local ISD file into a DataFrame."""
-    with open(path) as file:
-        return isd_pandas.data_frame(file, since=since)