Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: ProphetModelDataset #720

Merged
merged 19 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions kedro-datasets/RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
| Type | Description | Location |
|-------------------------------------|-----------------------------------------------------------|-----------------------------------------|
| `pytorch.PyTorchDataset` | A dataset for securely saving and loading PyTorch models | `kedro_datasets_experimental.pytorch` |
| `prophet.ProphetModelDataset` | A dataset for Meta's Prophet model for time series forecasting | `kedro_datasets_experimental.prophet` |


* Added the following new core datasets:

Expand All @@ -24,6 +26,7 @@ Many thanks to the following Kedroids for contributing PRs to this release:
* [yury-fedotov](https://github.com/yury-fedotov)
* [gitgud5000](https://github.com/gitgud5000)
* [janickspirig](https://github.com/janickspirig)
* [Galen Seilis](https://github.com/galenseilis)


# Release 4.1.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,6 @@ kedro_datasets_experimental
langchain.ChatOpenAIDataset
langchain.OpenAIEmbeddingsDataset
netcdf.NetCDFDataset
prophet.ProphetModelDataset
pytorch.PyTorchDataset
rioxarray.GeoTIFFDataset
2 changes: 2 additions & 0 deletions kedro-datasets/docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,8 @@
"xarray.core.dataset.Dataset",
"xarray.core.dataarray.DataArray",
"torch.nn.modules.module.Module",
"prophet.forecaster.Prophet",
"Prophet",
),
"py:data": (
"typing.Any",
Expand Down
11 changes: 11 additions & 0 deletions kedro-datasets/kedro_datasets_experimental/prophet/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
"""``ProphetModelDataset`` implementation to load/save a Prophet model from/to a JSON file."""

from typing import Any

import lazy_loader as lazy

# Bare annotation so static type checkers know the public name; it must match
# the attribute registered with ``lazy.attach`` below. No value is bound here.
ProphetModelDataset: Any

__getattr__, __dir__, __all__ = lazy.attach(
    __name__, submod_attrs={"prophet_dataset": ["ProphetModelDataset"]}
)
121 changes: 121 additions & 0 deletions kedro-datasets/kedro_datasets_experimental/prophet/prophet_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
from __future__ import annotations

from typing import Any

from kedro.io.core import Version, get_filepath_str
from prophet import Prophet
from prophet.serialize import model_from_json, model_to_json

from kedro_datasets.json import JSONDataset


class ProphetModelDataset(JSONDataset):
    """``ProphetModelDataset`` loads/saves Facebook Prophet models to a JSON file using an
    underlying filesystem (e.g., local, S3, GCS). It uses Prophet's built-in
    serialization to handle the JSON file.

    Example usage for the
    `YAML API <https://kedro.readthedocs.io/en/stable/data/\
    data_catalog_yaml_examples.html>`_:

    .. code-block:: yaml

        model:
          type: kedro_datasets_experimental.prophet.ProphetModelDataset
          filepath: gcs://your_bucket/model.json
          fs_args:
            project: my-project
          credentials: my_gcp_credentials

    Example usage for the
    `Python API <https://kedro.readthedocs.io/en/stable/data/\
    advanced_data_catalog_usage.html>`_:

    .. code-block:: pycon

        >>> from kedro_datasets_experimental.prophet import ProphetModelDataset
        >>> from prophet import Prophet
        >>> import pandas as pd
        >>>
        >>> df = pd.DataFrame({
        ...     "ds": ["2024-01-01", "2024-01-02", "2024-01-03"],
        ...     "y": [100, 200, 300]
        ... })
        >>>
        >>> model = Prophet()
        >>> model.fit(df)
        >>> dataset = ProphetModelDataset(filepath="path/to/model.json")
        >>> dataset.save(model)
        >>> reloaded_model = dataset.load()

    """

    def __init__(  # noqa: PLR0913
        self,
        *,
        filepath: str,
        save_args: dict[str, Any] | None = None,
        version: Version | None = None,
        credentials: dict[str, Any] | None = None,
        fs_args: dict[str, Any] | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> None:
        """Creates a new instance of ``ProphetModelDataset`` pointing to a concrete JSON file
        on a specific filesystem.

        Args:
            filepath: Filepath in POSIX format to a JSON file prefixed with a protocol like `s3://`.
                If prefix is not provided, `file` protocol (local filesystem) will be used.
                The prefix should be any protocol supported by ``fsspec``.
                Note: `http(s)` doesn't support versioning.
            save_args: Options forwarded unchanged to the ``JSONDataset`` constructor
                (documented there as arguments for ``json.dump``, see
                https://docs.python.org/3/library/json.html).
                Note: this class's ``_save`` writes the string produced by Prophet's
                ``model_to_json`` directly and does not read these options.
            version: If specified, should be an instance of
                ``kedro.io.core.Version``. If its ``load`` attribute is
                None, the latest version will be loaded. If its ``save``
                attribute is None, save version will be autogenerated.
            credentials: Credentials required to get access to the underlying filesystem.
                E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
            fs_args: Extra arguments to pass into underlying filesystem class constructor
                (e.g. `{"project": "my-project"}` for ``GCSFileSystem``), as well as
                to pass to the filesystem's `open` method through nested keys
                `open_args_load` and `open_args_save`.
                Here you can find all available arguments for `open`:
                https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
            metadata: Any arbitrary metadata.
                This is ignored by Kedro, but may be consumed by users or external plugins.
        """
        super().__init__(
            filepath=filepath,
            save_args=save_args,
            version=version,
            credentials=credentials,
            fs_args=fs_args,
            metadata=metadata,
        )

    def _load(self) -> Prophet:
        """Loads a Prophet model from a JSON file.

        Returns:
            Prophet: A deserialized Prophet model.
        """
        load_path = get_filepath_str(self._get_load_path(), self._protocol)

        with self._fs.open(load_path, **self._fs_open_args_load) as fs_file:
            # Prophet owns the JSON schema, so the raw file content is handed
            # to its deserializer rather than parsed with ``json`` here.
            return model_from_json(fs_file.read())

    def _save(self, data: Prophet) -> None:
        """Saves a Prophet model to a JSON file.

        Args:
            data: The Prophet model instance to be serialized and saved.
        """
        save_path = get_filepath_str(self._get_save_path(), self._protocol)

        with self._fs.open(save_path, **self._fs_open_args_save) as fs_file:
            # ``model_to_json`` already returns the serialized text; it is
            # written as-is (``save_args`` are not applied on this path).
            fs_file.write(model_to_json(data))

        self._invalidate_cache()
34 changes: 34 additions & 0 deletions kedro-datasets/kedro_datasets_experimental/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""
This file contains the fixtures that are reusable by any tests within
this directory. You don't need to import the fixtures as pytest will
discover them automatically. More info here:
https://docs.pytest.org/en/latest/fixture.html
"""

from kedro.io.core import generate_timestamp
from pytest import fixture


@fixture(params=[None])
def load_version(request):
    """Provide the load version for a test; parametrized with ``None`` only."""
    requested_version = request.param
    return requested_version


@fixture(params=[None])
def save_version(request):
    """Provide the save version for a test.

    Returns the fixture parameter when it is truthy; otherwise falls back to
    a freshly generated timestamp.
    """
    explicit_version = request.param
    if explicit_version:
        return explicit_version
    return generate_timestamp()


@fixture(params=[None])
def load_args(request):
    """Provide the ``load_args`` value for a test; parametrized with ``None`` only."""
    requested_load_args = request.param
    return requested_load_args


@fixture(params=[None])
def save_args(request):
    """Provide the ``save_args`` value for a test; parametrized with ``None`` only."""
    requested_save_args = request.param
    return requested_save_args


@fixture(params=[None])
def fs_args(request):
    """Provide the ``fs_args`` value for a test; parametrized with ``None`` only."""
    requested_fs_args = request.param
    return requested_fs_args
Empty file.
Loading