Skip to content

Commit

Permalink
Restructure library into record and batch
Browse files Browse the repository at this point in the history
  • Loading branch information
gutzbenj committed Jan 8, 2024
1 parent 2f30856 commit 245d7dc
Show file tree
Hide file tree
Showing 18 changed files with 339 additions and 346 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Added

- Restructure library into record and batch

## [0.2.1] - 2023-12-27

### Fixed
Expand Down
12 changes: 11 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,21 @@ There is a simple command line interface.
The `isd record` command prints a single record in JSON format:

```shell
isd record 720538-00164-2021
isd record tests/data/720538-00164-2021
```

The Python API allows reading compressed and uncompressed ISD files:

```python
from isd import Batch

batch = Batch.from_path("isd-file")
for record in batch:
    print(record)
```

Streaming is also supported:

```python
import isd.io

Expand Down
2 changes: 1 addition & 1 deletion docs/api/mod.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ Most useful functions and classes are contained in submodules.

errors
io
pandas
record
batch

isd
---
Expand Down
5 changes: 0 additions & 5 deletions docs/api/pandas.rst

This file was deleted.

3 changes: 2 additions & 1 deletion src/isd/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from isd.errors import IsdError
from isd.batch import Batch
from isd.record import Record

__all__ = ["IsdError", "Record"]
__all__ = ["IsdError", "Batch", "Record"]
87 changes: 87 additions & 0 deletions src/isd/batch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import gzip
import json
from io import BytesIO
from pathlib import Path
from dataclasses import dataclass
from typing import List, Union, Optional, Dict, Any, Iterator
import datetime as dt

from isd.record import Record

import pandas as pd


@dataclass
class Batch:
    """An ordered, in-memory collection of ISD records.

    A batch wraps a plain list of ``Record`` objects and supports ``len()``,
    indexing and iteration, plus helpers to build a batch from text or a file
    and to export it as dictionaries, JSON or a pandas DataFrame.
    """

    # The wrapped records, in the order they were parsed or supplied.
    records: List[Record]

    def __len__(self) -> int:
        """Return the number of records in the batch."""
        return len(self.records)

    def __getitem__(self, index: int) -> Record:
        """Return the record at position ``index``.

        Raises:
            IndexError: If ``index`` is out of range for the record list.
        """
        return self.records[index]

    def __iter__(self) -> Iterator[Record]:
        """Return an iterator over the records, in order."""
        return iter(self.records)

    @classmethod
    def parse(cls, lines: Union[str, BytesIO]) -> "Batch":
        """Build a batch from ISD text, one record per line.

        Args:
            lines: Either the text itself, or a ``BytesIO`` whose remaining
                content is read and decoded as UTF-8.

        Returns:
            A batch holding one parsed record per line of the input.
        """
        if isinstance(lines, BytesIO):
            lines = lines.read().decode("utf-8")
        return cls([Record.parse(line) for line in lines.splitlines()])

    @classmethod
    def from_path(cls, path: Union[str, Path]) -> "Batch":
        """Read a local ISD file into a batch.

        If the path has a ``.gz`` extension the file is assumed to be gzip
        compressed and is opened with ``gzip.open``. Both compressed and
        uncompressed files are decoded as UTF-8.

        Args:
            path: Location of the ISD file.

        Returns:
            A batch holding one parsed record per line of the file.
        """
        path = Path(path)
        if path.suffix == ".gz":
            with gzip.open(path) as gzip_file:
                return cls(
                    [Record.parse(raw_line.decode("utf-8")) for raw_line in gzip_file]
                )
        # Pin the encoding so the result matches the gzip branch and does not
        # depend on the platform's locale default.
        with open(path, encoding="utf-8") as uncompressed_file:
            return cls([Record.parse(line) for line in uncompressed_file])

    def filter_by_datetime(
        self,
        start_date: Optional[dt.datetime] = None,
        end_date: Optional[dt.datetime] = None,
    ) -> "Batch":
        """Return a new batch restricted to a datetime window.

        Args:
            start_date: Inclusive lower bound; ``None`` means no lower bound.
            end_date: Exclusive upper bound; ``None`` means no upper bound.

        Returns:
            A new batch with the records whose ``datetime()`` satisfies
            ``start_date <= datetime < end_date``. This batch is not modified.
        """
        selected = []
        for record in self.records:
            # Evaluate the timestamp once and reuse it for both bounds.
            timestamp = record.datetime()
            if start_date is not None and not timestamp >= start_date:
                continue
            if end_date is not None and not timestamp < end_date:
                continue
            selected.append(record)
        return Batch(selected)

    def to_dict(self) -> List[Dict[str, Any]]:
        """Return a list of dictionaries, one for each record."""
        return [record.to_dict() for record in self.records]

    def to_json(self, indent: int = 4) -> str:
        """Return all records serialized as a JSON array.

        Each record's ``"datetime"`` value is replaced by its ISO 8601 string
        so the result is JSON serializable.

        Args:
            indent: Indentation passed through to ``json.dumps``.
        """
        # Build new dictionaries instead of mutating the ones from to_dict().
        data = [
            {**record_dict, "datetime": record_dict["datetime"].isoformat()}
            for record_dict in self.to_dict()
        ]
        return json.dumps(data, indent=indent)

    def to_df(self) -> pd.DataFrame:
        """Return the records as a pandas DataFrame, one row per record."""
        # pandas is already imported at module level as ``pd``.
        return pd.DataFrame(self.to_dict())
18 changes: 7 additions & 11 deletions src/isd/cli.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,9 @@
# type: ignore

import dataclasses
import itertools
import json

import click
from click import ClickException

import isd.io
from isd.batch import Batch


@click.group()
Expand All @@ -20,9 +16,9 @@ def main() -> None:
@click.option("-i", "--index", default=0)
def record(infile: str, index: int) -> None:
"""Prints a single record to standard output in JSON format."""
with isd.io.open(infile) as records:
record = next(itertools.islice(records, index, None), None)
if record:
print(json.dumps(dataclasses.asdict(record), indent=4))
else:
raise ClickException(f"No record with index {index}")
batch = Batch.from_path(infile)
try:
record_ = batch[index]
print(record_.to_json())
except IndexError:
raise ClickException(f"No record with index {index}")
23 changes: 1 addition & 22 deletions src/isd/io.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
import datetime
import gzip
import os.path
from contextlib import contextmanager
from typing import Generator, Iterable, Iterator, Optional, TextIO
from typing import Generator, Iterable

from pandas import DataFrame

from . import pandas as isd_pandas
from .record import Record

builtin_open = open
Expand All @@ -28,21 +25,3 @@ def open(path: str) -> Generator[Iterable[Record], None, None]:
Record.parse(uncompressed_line)
for uncompressed_line in uncompressed_file
)


def from_text_io(text_io: TextIO) -> Iterator[Record]:
    """Lazily yield one parsed record per line of a text stream.

    Lines are pulled with ``readline`` and handed to ``Record.parse``;
    iteration stops at the first falsy line, i.e. at end of stream.
    """
    current_line = text_io.readline()
    while current_line:
        yield Record.parse(current_line)
        # Fetch the next line only once the consumer asks for more.
        current_line = text_io.readline()


def read_to_data_frame(
    path: str, since: Optional[datetime.datetime] = None
) -> DataFrame:
    """Load the ISD file at ``path`` and return it as a DataFrame.

    The file is opened with this module's own ``open`` (not the builtin) and
    the resulting records are passed to ``isd_pandas.data_frame`` together
    with ``since``.
    """
    # NOTE(review): ``since`` is forwarded unchanged; presumably a lower
    # datetime bound -- confirm against ``isd_pandas.data_frame``.
    with open(path) as records:
        frame = isd_pandas.data_frame(records, since=since)
    return frame
165 changes: 0 additions & 165 deletions src/isd/pandas.py

This file was deleted.

Loading

0 comments on commit 245d7dc

Please sign in to comment.