Restructure library into record and batch

earthobservations · Jan 8, 2024 · 245d7dc · 245d7dc
1 parent 2f30856
commit 245d7dc
Show file tree

Hide file tree

Showing 18 changed files with 339 additions and 346 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [Unreleased]
+
+### Added
+
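+- Restructure library into `record` and `batch` modules: the new `Batch` class replaces `isd.io.from_text_io` and `isd.io.read_to_data_frame` (see `Batch.parse` and `Batch.to_df`)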
+
 ## [0.2.1] - 2023-12-27
 
 ### Fixed

diff --git a/README.md b/README.md
@@ -19,11 +19,21 @@ There is a simple command line interface.
 The `isd record` command prints a single record in JSON format:
 
 ```shell
-isd record 720538-00164-2021
+isd record tests/data/720538-00164-2021
 ```
 
 The Python API allows reading compressed and uncompressed ISD files:
 
+```python
+from isd import Batch
+
+batch = Batch.from_path("isd-file")
+for record in batch:
+    print(record)
+```
+
+Streaming is also supported:
+
 ```python
 import isd.io
 

diff --git a/docs/api/mod.rst b/docs/api/mod.rst
@@ -9,8 +9,8 @@ Most useful functions and classes are contained in submodules.
 
    errors
    io
-   pandas
    record
+   batch
 
 isd
 ---

diff --git a/docs/api/pandas.rst b/docs/api/pandas.rst
diff --git a/src/isd/__init__.py b/src/isd/__init__.py
@@ -1,4 +1,5 @@
 from isd.errors import IsdError
+from isd.batch import Batch
 from isd.record import Record
 
-__all__ = ["IsdError", "Record"]
+__all__ = ["IsdError", "Batch", "Record"]
diff --git a/src/isd/batch.py b/src/isd/batch.py
@@ -0,0 +1,87 @@
+import gzip
+import json
+from io import BytesIO
+from pathlib import Path
+from dataclasses import dataclass
+from typing import List, Union, Optional, Dict, Any, Iterator
+import datetime as dt
+
+from isd.record import Record
+
+import pandas as pd
+
+
+@dataclass
+class Batch:
+    records: List[Record]
+
+    def __len__(self) -> int:
+        return len(self.records)
+
+    def __getitem__(self, index: int) -> Record:
+        return self.records[index]
+
+    def __iter__(self) -> Iterator[Record]:
+        return iter(self.records)
+
+    @classmethod
+    def parse(cls, lines: Union[str, BytesIO]) -> "Batch":
+        """Parses one record per line from a string or a UTF-8 encoded BytesIO."""
+        if isinstance(lines, BytesIO):
+            lines = lines.read().decode("utf-8")
+        return cls([Record.parse(line) for line in lines.splitlines()])
+
+    @classmethod
+    def from_path(cls, path: Union[str, Path]) -> "Batch":
+        """Opens a local ISD file and returns a Batch of its parsed records.
+
+        If the path has a .gz extension, this function will assume it has gzip
+        compression and will attempt to open it using `gzip.open`.
+        """
+        path = Path(path)
+        if path.suffix == ".gz":
+            with gzip.open(path) as gzip_file:
+                return cls(
+                    [Record.parse(gzip_line.decode("utf-8")) for gzip_line in gzip_file]
+                )
+        else:
+            with open(path) as uncompressed_file:
+                return cls(
+                    [
+                        Record.parse(uncompressed_line)
+                        for uncompressed_line in uncompressed_file
+                    ]
+                )
+
+    def filter_by_datetime(
+        self,
+        start_date: Optional[dt.datetime] = None,
+        end_date: Optional[dt.datetime] = None,
+    ) -> "Batch":
+        """Returns a new Batch of records with start_date <= datetime < end_date (both optional)."""
+        return Batch(
+            [
+                record
+                for record in self.records
+                if (not start_date or record.datetime() >= start_date)
+                and (not end_date or record.datetime() < end_date)
+            ]
+        )
+
+    def to_dict(self) -> List[Dict[str, Any]]:
+        """Returns a list of dictionaries, one for each record."""
+        return [record.to_dict() for record in self.records]
+
+    def to_json(self, indent: int = 4) -> str:
+        """Returns a JSON array string of all records, with datetimes in ISO 8601 format."""
+        data = []
+        for d in self.to_dict():
+            d["datetime"] = d["datetime"].isoformat()
+            data.append(d)
+        return json.dumps(data, indent=indent)
+
+    def to_df(self) -> pd.DataFrame:
+        """Converts the records into a DataFrame with one row per record."""
+        import pandas as pd
+
+        return pd.DataFrame([record.to_dict() for record in self.records])
diff --git a/src/isd/cli.py b/src/isd/cli.py
@@ -1,13 +1,9 @@
 # type: ignore
 
-import dataclasses
-import itertools
-import json
-
 import click
 from click import ClickException
 
-import isd.io
+from isd.batch import Batch
 
 
 @click.group()
@@ -20,9 +16,9 @@ def main() -> None:
 @click.option("-i", "--index", default=0)
 def record(infile: str, index: int) -> None:
     """Prints a single record to standard output in JSON format."""
-    with isd.io.open(infile) as records:
-        record = next(itertools.islice(records, index, None), None)
-        if record:
-            print(json.dumps(dataclasses.asdict(record), indent=4))
-        else:
-            raise ClickException(f"No record with index {index}")
+    batch = Batch.from_path(infile)
+    try:
+        record_ = batch[index]
+        print(record_.to_json())
+    except IndexError:
+        raise ClickException(f"No record with index {index}")
diff --git a/src/isd/io.py b/src/isd/io.py
@@ -1,12 +1,9 @@
-import datetime
 import gzip
 import os.path
 from contextlib import contextmanager
-from typing import Generator, Iterable, Iterator, Optional, TextIO
+from typing import Generator, Iterable
 
-from pandas import DataFrame
 
-from . import pandas as isd_pandas
 from .record import Record
 
 builtin_open = open
@@ -28,21 +25,3 @@ def open(path: str) -> Generator[Iterable[Record], None, None]:
                 Record.parse(uncompressed_line)
                 for uncompressed_line in uncompressed_file
             )
-
-
-def from_text_io(text_io: TextIO) -> Iterator[Record]:
-    """Reads records from a text io stream."""
-    while True:
-        line = text_io.readline()
-        if not line:
-            break
-        else:
-            yield Record.parse(line)
-
-
-def read_to_data_frame(
-    path: str, since: Optional[datetime.datetime] = None
-) -> DataFrame:
-    """Reads a local ISD file into a DataFrame."""
-    with open(path) as file:
-        return isd_pandas.data_frame(file, since=since)
diff --git a/src/isd/pandas.py b/src/isd/pandas.py