Restructure library into record and batch

earthobservations · Dec 22, 2023 · 477669d · 477669d
1 parent db9e892
commit 477669d
Show file tree

Hide file tree

Showing 12 changed files with 223 additions and 407 deletions.
diff --git a/src/isd/batch.py b/src/isd/batch.py
@@ -0,0 +1,53 @@
+import gzip
+from io import BytesIO
+from pathlib import Path
+from dataclasses import dataclass
+from typing import List, TYPE_CHECKING, Union, Optional
+import datetime as dt
+
+from src.isd.record import Record
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+
+@dataclass
+class Batch:
+    """A list of parsed ISD records plus helpers to load, filter and export them."""
+
+    records: List[Record]
+
+    @classmethod
+    def from_path(cls, path: Union[str, Path]) -> "Batch":
+        """Reads a local ISD file and returns a Batch holding one Record per line.
+
+        A ".gz" suffix selects `gzip.open`; either way the text is decoded as UTF-8.
+        """
+        path = Path(path)
+        if path.suffix == ".gz":
+            with gzip.open(path) as gzip_file:
+                return cls([Record.from_string(gzip_line.decode("utf-8")) for gzip_line in gzip_file])
+        with open(path, encoding="utf-8") as uncompressed_file:  # match the gzip branch, not the locale
+            return cls([Record.from_string(uncompressed_line) for uncompressed_line in uncompressed_file])
+
+    @classmethod
+    def from_string(cls, string: Union[str, BytesIO]) -> "Batch":
+        """Parses one record per line from a str, or from a BytesIO holding UTF-8 text."""
+        if isinstance(string, BytesIO):
+            string = string.read().decode("utf-8")
+        return cls([Record.from_string(line) for line in string.splitlines()])
+
+    def filter_by_datetime(
+        self, start_date: Optional[dt.datetime] = None, end_date: Optional[dt.datetime] = None
+    ) -> List[Record]:
+        """Returns the records with start_date <= record.datetime() < end_date; None disables a bound."""
+        return [
+            record for record in self.records
+            if (start_date is None or record.datetime() >= start_date)
+            and (end_date is None or record.datetime() < end_date)
+        ]
+
+    def to_df(self) -> "pd.DataFrame":
+        """Builds a pandas DataFrame from the `to_dict()` output of every record."""
+        import pandas as pd
+        return pd.DataFrame([record.to_dict() for record in self.records])
diff --git a/src/isd/cli.py b/src/isd/cli.py
@@ -1,13 +1,9 @@
 # type: ignore
 
-import dataclasses
-import itertools
-import json
-
 import click
 from click import ClickException
 
-import isd.io
+from src.isd.batch import Batch
 
 
 @click.group()
@@ -20,9 +16,9 @@ def main() -> None:
 @click.option("-i", "--index", default=0)
 def record(infile: str, index: int) -> None:
     """Prints a single record to standard output in JSON format."""
-    with isd.io.open(infile) as records:
-        record = next(itertools.islice(records, index, None), None)
-        if record:
-            print(json.dumps(dataclasses.asdict(record), indent=4))
-        else:
-            raise ClickException(f"No record with index {index}")
+    batch = Batch.from_path(infile)
+    try:
+        record_ = batch.records[index]  # guard only the lookup, not the serialisation below
+    except IndexError:
+        raise ClickException(f"No record with index {index}") from None
+    print(record_.to_json())
diff --git a/src/isd/io.py b/src/isd/io.py
diff --git a/src/isd/pandas.py b/src/isd/pandas.py