Skip to content

Commit

Permalink
Restructure library into record and batch
Browse files Browse the repository at this point in the history
  • Loading branch information
gutzbenj committed Jan 17, 2024
1 parent 2f30856 commit 7f17f51
Show file tree
Hide file tree
Showing 19 changed files with 344 additions and 351 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Added

- Restructure library into record and batch ([#36](https://github.com/gadomski/pyisd/pull/36))

## [0.2.1] - 2023-12-27

### Fixed
Expand Down
12 changes: 11 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,21 @@ There is a simple command line interface.
The `isd record` command prints a single record in JSON format:

```shell
isd record 720538-00164-2021
isd record tests/data/720538-00164-2021
```

The Python API allows reading compressed and uncompressed ISD files:

```python
from isd import Batch

batch = Batch.from_path("isd-file")
for record in batch:
    print(record)
```

Streaming is also supported:

```python
import isd.io

Expand Down
2 changes: 1 addition & 1 deletion docs/api/mod.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ Most useful functions and classes are contained in submodules.

errors
io
pandas
record
batch

isd
---
Expand Down
5 changes: 0 additions & 5 deletions docs/api/pandas.rst

This file was deleted.

10 changes: 5 additions & 5 deletions examples/check_timesteps.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@

import tqdm

import isd.io
from isd import Batch

directory = sys.argv[1]
paths = [os.path.join(directory, file_name) for file_name in os.listdir(directory)]
all_monotonic = True
bad_paths = []
for path in tqdm.tqdm(paths):
data_frame = isd.io.read_to_data_frame(path)
min = data_frame.timestamp.min()
max = data_frame.timestamp.max()
is_monotonic = data_frame.timestamp.is_monotonic
data_frame = Batch.from_path(path).to_data_frame()
min = data_frame.datetime.min()
max = data_frame.datetime.max()
is_monotonic = data_frame.datetime.is_monotonic
if not is_monotonic:
all_monotonic = False
bad_paths.append(path)
Expand Down
3 changes: 2 additions & 1 deletion src/isd/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from isd.errors import IsdError
from isd.batch import Batch
from isd.record import Record

__all__ = ["IsdError", "Record"]
__all__ = ["IsdError", "Batch", "Record"]
86 changes: 86 additions & 0 deletions src/isd/batch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from __future__ import annotations
import gzip
import json
from io import BytesIO
from pathlib import Path
from dataclasses import dataclass
from typing import List, Union, Optional, Dict, Any, Iterator
import datetime

from isd.record import Record

import pandas


@dataclass
class Batch:
    """An ordered, in-memory collection of ISD records.

    A batch behaves like a read-only sequence: it supports ``len()``,
    integer indexing and iteration over its records. Use :meth:`parse` or
    :meth:`from_path` to build one from raw ISD text.
    """

    # The records held by this batch, in the order they were read.
    records: List[Record]

    def __len__(self) -> int:
        """Returns the number of records in this batch."""
        return len(self.records)

    def __getitem__(self, index: int) -> Record:
        """Returns the record at ``index``.

        Raises:
            IndexError: If ``index`` is out of range.
        """
        return self.records[index]

    def __iter__(self) -> Iterator[Record]:
        """Returns an iterator over the records, in order."""
        return iter(self.records)

    @classmethod
    def parse(cls, lines: Union[str, BytesIO]) -> Batch:
        """Parses records from a string or a UTF-8 encoded ``BytesIO`` stream.

        The input is split into lines and every line is handed to
        ``Record.parse``.
        """
        if isinstance(lines, BytesIO):
            text = lines.read().decode("utf-8")
        else:
            text = lines
        return cls([Record.parse(line) for line in text.splitlines()])

    @classmethod
    def from_path(cls, path: Union[str, Path]) -> Batch:
        """Reads a local ISD file and returns its records as a ``Batch``.

        If the path has a .gz extension, this function will assume it has gzip
        compression and will attempt to open it using `gzip.open`. Both
        compressed and uncompressed files are decoded as UTF-8.
        """
        path = Path(path)
        if path.suffix == ".gz":
            # gzip.open defaults to binary mode, so each line is decoded here.
            with gzip.open(path) as gzip_file:
                return cls(
                    [Record.parse(gzip_line.decode("utf-8")) for gzip_line in gzip_file]
                )
        # Explicit encoding keeps this branch consistent with the gzip branch
        # instead of depending on the platform's default encoding.
        with open(path, encoding="utf-8") as uncompressed_file:
            return cls(
                [
                    Record.parse(uncompressed_line)
                    for uncompressed_line in uncompressed_file
                ]
            )

    def filter_by_datetime(
        self,
        start_date: Optional[datetime.datetime] = None,
        end_date: Optional[datetime.datetime] = None,
    ) -> Batch:
        """Returns a new Batch with records filtered by start and end datetimes.

        Both bounds are optional. ``start_date`` is inclusive and ``end_date``
        is exclusive; a bound that is ``None`` is not applied.
        """

        def in_range(moment: datetime.datetime) -> bool:
            after_start = start_date is None or moment >= start_date
            before_end = end_date is None or moment < end_date
            return after_start and before_end

        # record.datetime() is evaluated once per record and checked against
        # both bounds.
        return Batch(
            [record for record in self.records if in_range(record.datetime())]
        )

    def to_dict(self) -> List[Dict[str, Any]]:
        """Returns a list of dictionaries, one for each record."""
        return [record.to_dict() for record in self.records]

    @staticmethod
    def _json_default(value: Any) -> str:
        """Serializes date/datetime values for ``json.dumps``.

        Raises:
            TypeError: For any other type that json cannot serialize.
        """
        if isinstance(value, (datetime.date, datetime.datetime)):
            return value.isoformat()
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable"
        )

    def to_json(self, indent: int = 4) -> str:
        """Returns a JSON array string with one object per record.

        Date and datetime values are written in ISO 8601 format, regardless
        of the key they are stored under in the record dictionaries.
        """
        return json.dumps(self.to_dict(), indent=indent, default=self._json_default)

    def to_data_frame(self) -> pandas.DataFrame:
        """Converts the records of this batch into a DataFrame, one row per record."""
        return pandas.DataFrame(self.to_dict())
18 changes: 7 additions & 11 deletions src/isd/cli.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,9 @@
# type: ignore

import dataclasses
import itertools
import json

import click
from click import ClickException

import isd.io
from isd.batch import Batch


@click.group()
Expand All @@ -20,9 +16,9 @@ def main() -> None:
@click.option("-i", "--index", default=0)
def record(infile: str, index: int) -> None:
"""Prints a single record to standard output in JSON format."""
with isd.io.open(infile) as records:
record = next(itertools.islice(records, index, None), None)
if record:
print(json.dumps(dataclasses.asdict(record), indent=4))
else:
raise ClickException(f"No record with index {index}")
batch = Batch.from_path(infile)
try:
record_ = batch[index]
print(record_.to_json())
except IndexError as e:
raise ClickException(f"No record with index {index}") from e
23 changes: 1 addition & 22 deletions src/isd/io.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
import datetime
import gzip
import os.path
from contextlib import contextmanager
from typing import Generator, Iterable, Iterator, Optional, TextIO
from typing import Generator, Iterable

from pandas import DataFrame

from . import pandas as isd_pandas
from .record import Record

builtin_open = open
Expand All @@ -28,21 +25,3 @@ def open(path: str) -> Generator[Iterable[Record], None, None]:
Record.parse(uncompressed_line)
for uncompressed_line in uncompressed_file
)


def from_text_io(text_io: TextIO) -> Iterator[Record]:
"""Reads records from a text io stream."""
while True:
line = text_io.readline()
if not line:
break
else:
yield Record.parse(line)


def read_to_data_frame(
    path: str, since: Optional[datetime.datetime] = None
) -> DataFrame:
    """Builds a DataFrame from the local ISD file at ``path``.

    The file is opened with this module's ``open`` and the result is passed,
    together with ``since``, to ``isd_pandas.data_frame``.
    """
    with open(path) as records:
        frame = isd_pandas.data_frame(records, since=since)
    return frame
Loading

0 comments on commit 7f17f51

Please sign in to comment.