From 3d36e57992480ee0812bfbee3e6a06f129f7ac9e Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Fri, 11 Oct 2024 03:22:02 -0600 Subject: [PATCH] feat(datasets): create separate `ibis.FileDataset` (#842) * feat(datasets): create separate `ibis.FileDataset` Signed-off-by: Deepyaman Datta * chore(datasets): deprecate `TableDataset` file I/O Signed-off-by: Deepyaman Datta * feat(datasets): implement `FileDataset` versioning Signed-off-by: Deepyaman Datta * chore(datasets): try `os.path.exists`, for Windows Signed-off-by: Deepyaman Datta * revert(datasets): use pathlib, ignore Windows test Refs: b7ff0c7 Signed-off-by: Deepyaman Datta * docs(datasets): add `ibis.FileDataset` to contents Signed-off-by: Deepyaman Datta * chore(datasets): add docstring for `hashable` func Signed-off-by: Deepyaman Datta * chore(datasets): add docstring for `hashable` func Signed-off-by: Deepyaman Datta * feat(datasets)!: expose `load` and `save` publicly Signed-off-by: Deepyaman Datta * chore(datasets): remove second filepath assignment Signed-off-by: Deepyaman Datta --------- Signed-off-by: Deepyaman Datta --- kedro-datasets/RELEASE.md | 376 ++++++++++-------- .../docs/source/api/kedro_datasets.rst | 1 + .../kedro_datasets/ibis/__init__.py | 4 +- .../kedro_datasets/ibis/file_dataset.py | 195 +++++++++ .../kedro_datasets/ibis/table_dataset.py | 29 +- .../tests/ibis/test_file_dataset.py | 274 +++++++++++++ 6 files changed, 706 insertions(+), 173 deletions(-) create mode 100644 kedro-datasets/kedro_datasets/ibis/file_dataset.py create mode 100644 kedro-datasets/tests/ibis/test_file_dataset.py diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index bc420e2e9..d90898153 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -1,356 +1,414 @@ # Upcoming Release +## Major features and improvements + +- Added the following new core datasets: + +| Type | Description | Location | +| ------------------- | ------------------------------------------------------------- | --------------------- | +| `ibis.TableDataset` | A dataset for loading and saving files using Ibis's backends. | `kedro_datasets.ibis` | # Release 5.0.0 + ## Major features and improvements -* Removed support for Python 3.9 -* Added the following new **experimental** datasets: -| Type | Description | Location | -|-------------------------------------|-----------------------------------------------------------|-----------------------------------------| -| `pytorch.PyTorchDataset` | A dataset for securely saving and loading PyTorch models | `kedro_datasets_experimental.pytorch` | -| `prophet.ProphetModelDataset` | A dataset for Meta's Prophet model for time series forecasting | `kedro_datasets_experimental.prophet` | +- Removed support for Python 3.9. +- Added the following new **experimental** datasets: +| Type | Description | Location | +| ----------------------------- | --------------------------------------------------------------- | ------------------------------------- | +| `pytorch.PyTorchDataset` | A dataset for securely saving and loading PyTorch models. | `kedro_datasets_experimental.pytorch` | +| `prophet.ProphetModelDataset` | A dataset for Meta's Prophet model for time series forecasting. 
| `kedro_datasets_experimental.prophet` | -* Added the following new core datasets: +- Added the following new core datasets: -| Type | Description | Location | -|----------------------|------------------------------------------------|-------------------------| -| `plotly.HTMLDataset` | A dataset for saving a `plotly` figure as HTML | `kedro_datasets.plotly` | +| Type | Description | Location | +| -------------------- | ----------------------------------------------- | ----------------------- | +| `plotly.HTMLDataset` | A dataset for saving a `plotly` figure as HTML. | `kedro_datasets.plotly` | ## Bug fixes and other changes -* Refactored all datasets to set `fs_args` defaults in the same way as `load_args` and `save_args` and not have hardcoded values in the save methods. -* Fixed bug related to loading/saving models from/to remote storage using `TensorFlowModelDataset`. -* Fixed deprecated load and save approaches of GBQTableDataset and GBQQueryDataset by invoking save and load directly over `pandas-gbq` lib -* Fixed incorrect `pandas` optional dependency + +- Refactored all datasets to set `fs_args` defaults in the same way as `load_args` and `save_args` and not have hardcoded values in the save methods. +- Fixed bug related to loading/saving models from/to remote storage using `TensorFlowModelDataset`. +- Fixed deprecated load and save approaches of `GBQTableDataset` and `GBQQueryDataset` by invoking save and load directly over `pandas-gbq` lib. +- Fixed incorrect `pandas` optional dependency. ## Breaking Changes -* Exposed `load` and `save` publicly for each dataset. This requires Kedro version 0.19.7 or higher. -* Replaced the `geopandas.GeoJSONDataset` with `geopandas.GenericDataset` to support parquet and feather file formats. + +- Exposed `load` and `save` publicly for each dataset. This requires Kedro version 0.19.7 or higher. +- Replaced the `geopandas.GeoJSONDataset` with `geopandas.GenericDataset` to support parquet and feather file formats. ## Community contributions + Many thanks to the following Kedroids for contributing PRs to this release: -* [Brandon Meek](https://github.com/bpmeek) -* [yury-fedotov](https://github.com/yury-fedotov) -* [gitgud5000](https://github.com/gitgud5000) -* [janickspirig](https://github.com/janickspirig) -* [Galen Seilis](https://github.com/galenseilis) -* [Mariusz Wojakowski](https://github.com/mariusz89016) -* [harm-matthias-harms](https://github.com/harm-matthias-harms) -* [Felix Scherz](https://github.com/felixscherz) +- [Brandon Meek](https://github.com/bpmeek) +- [yury-fedotov](https://github.com/yury-fedotov) +- [gitgud5000](https://github.com/gitgud5000) +- [janickspirig](https://github.com/janickspirig) +- [Galen Seilis](https://github.com/galenseilis) +- [Mariusz Wojakowski](https://github.com/mariusz89016) +- [harm-matthias-harms](https://github.com/harm-matthias-harms) +- [Felix Scherz](https://github.com/felixscherz) # Release 4.1.0 + ## Major features and improvements -* Improved `partitions.PartitionedDataset` representation when printing. + +- Improved `partitions.PartitionedDataset` representation when printing. ## Bug fixes and other changes -* Updated `ibis.TableDataset` to make sure credentials are not printed in interactive environment. + +- Updated `ibis.TableDataset` to make sure credentials are not printed in interactive environment. 
## Breaking Changes -## Community contributions +## Community contributions # Release 4.0.0 + ## Major features and improvements -* Added the following new **experimental** datasets: +- Added the following new **experimental** datasets: | Type | Description | Location | -|-------------------------------------|-----------------------------------------------------------|-----------------------------------------| +| ----------------------------------- | --------------------------------------------------------- | --------------------------------------- | | `langchain.ChatAnthropicDataset` | A dataset for loading a ChatAnthropic langchain model. | `kedro_datasets_experimental.langchain` | | `langchain.ChatCohereDataset` | A dataset for loading a ChatCohere langchain model. | `kedro_datasets_experimental.langchain` | | `langchain.OpenAIEmbeddingsDataset` | A dataset for loading a OpenAIEmbeddings langchain model. | `kedro_datasets_experimental.langchain` | | `langchain.ChatOpenAIDataset` | A dataset for loading a ChatOpenAI langchain model. | `kedro_datasets_experimental.langchain` | | `rioxarray.GeoTIFFDataset` | A dataset for loading and saving geotiff raster data | `kedro_datasets_experimental.rioxarray` | -| `netcdf.NetCDFDataset` | A dataset for loading and saving "*.nc" files. | `kedro_datasets_experimental.netcdf` | +| `netcdf.NetCDFDataset` | A dataset for loading and saving "\*.nc" files. | `kedro_datasets_experimental.netcdf` | -* Added the following new core datasets: +- Added the following new core datasets: -| Type | Description | Location | -|-------------------------------------|-----------------------------------------------------------|-----------------------------------------| -| `dask.CSVDataset` | A dataset for loading a CSV files using `dask` | `kedro_datasets.dask` | +| Type | Description | Location | +| ----------------- | ---------------------------------------------- | --------------------- | +| `dask.CSVDataset` | A dataset for loading a CSV files using `dask` | `kedro_datasets.dask` | -* Extended preview feature to `yaml.YAMLDataset`. +- Extended preview feature to `yaml.YAMLDataset`. ## Bug fixes and other changes -* Added `metadata` parameter for a few datasets + +- Added `metadata` parameter for a few datasets ## Breaking Changes -* `netcdf.NetCDFDataset` moved from `kedro_datasets` to `kedro_datasets_experimental`. + +- `netcdf.NetCDFDataset` moved from `kedro_datasets` to `kedro_datasets_experimental`. ## Community contributions Many thanks to the following Kedroids for contributing PRs to this release: -* [Ian Whalen](https://github.com/ianwhale) -* [Charles Guan](https://github.com/charlesbmi) -* [Thomas Gölles](https://github.com/tgoelles) -* [Lukas Innig](https://github.com/derluke) -* [Michael Sexton](https://github.com/michaelsexton) -* [michal-mmm](https://github.com/michal-mmm) - - +- [Ian Whalen](https://github.com/ianwhale) +- [Charles Guan](https://github.com/charlesbmi) +- [Thomas Gölles](https://github.com/tgoelles) +- [Lukas Innig](https://github.com/derluke) +- [Michael Sexton](https://github.com/michaelsexton) +- [michal-mmm](https://github.com/michal-mmm) # Release 3.0.1 ## Bug fixes and other changes -* Removed arbitrary upper bound for `s3fs`. -* `NetCDFDataset` support for NetCDF4 via `engine="netcdf4"` and `engine="h5netcdf"` + +- Removed arbitrary upper bound for `s3fs`. +- Added support for NetCDF4 via `engine="netcdf4"` and `engine="h5netcdf"` to `netcdf.NetCDFDataset`. 
## Community contributions + Many thanks to the following Kedroids for contributing PRs to this release: -* [Charles Guan](https://github.com/charlesbmi) + +- [Charles Guan](https://github.com/charlesbmi) # Release 3.0.0 + ## Major features and improvements -* Added the following new datasets: +- Added the following new datasets: -| Type | Description | Location | -|-------------------------|-----------------------------------------------------------|-------------------------| -| `netcdf.NetCDFDataset` | A dataset for loading and saving `*.nc` files. | `kedro_datasets.netcdf` | -| `ibis.TableDataset` | A dataset for loading and saving using Ibis's backends. | `kedro_datasets.ibis` | +| Type | Description | Location | +| ---------------------- | ------------------------------------------------------- | ----------------------- | +| `netcdf.NetCDFDataset` | A dataset for loading and saving `*.nc` files. | `kedro_datasets.netcdf` | +| `ibis.TableDataset` | A dataset for loading and saving using Ibis's backends. | `kedro_datasets.ibis` | + +- Added support for Python 3.12. +- Normalised optional dependencies names for datasets to follow [PEP 685](https://peps.python.org/pep-0685/). The `.` characters have been replaced with `-` in the optional dependencies names. Note that this might be breaking for some users. For example, users should now install optional dependencies for `pandas.ParquetDataset` from `kedro-datasets` like this: -* Added support for Python 3.12. -* Normalised optional dependencies names for datasets to follow [PEP 685](https://peps.python.org/pep-0685/). The `.` characters have been replaced with `-` in the optional dependencies names. Note that this might be breaking for some users. For example, users should now install optional dependencies for `pandas.ParquetDataset` from `kedro-datasets` like this: ```bash pip install kedro-datasets[pandas-parquetdataset] ``` -* Removed `setup.py` and move to `pyproject.toml` completely for `kedro-datasets`. + +- Removed `setup.py` and move to `pyproject.toml` completely for `kedro-datasets`. ## Bug fixes and other changes -* If using MSSQL, `load_args:params` will be typecasted as tuple. -* Fixed bug with loading datasets from Hugging Face. Now allows passing parameters to the load_dataset function. -* Made `connection_args` argument optional when calling `create_connection()` in `sql_dataset.py`. + +- If using MSSQL, `load_args:params` will be typecasted as tuple. +- Fixed bug with loading datasets from Hugging Face. Now allows passing parameters to the load_dataset function. +- Made `connection_args` argument optional when calling `create_connection()` in `sql_dataset.py`. 
## Community contributions + Many thanks to the following Kedroids for contributing PRs to this release: -* [Riley Brady](https://github.com/riley-brady) -* [Andrew Cao](https://github.com/andrewcao1) -* [Eduardo Romero Lopez](https://github.com/eromerobilbomatica) -* [Jerome Asselin](https://github.com/jerome-asselin-buspatrol) +- [Riley Brady](https://github.com/riley-brady) +- [Andrew Cao](https://github.com/andrewcao1) +- [Eduardo Romero Lopez](https://github.com/eromerobilbomatica) +- [Jerome Asselin](https://github.com/jerome-asselin-buspatrol) # Release 2.1.0 + ## Major features and improvements -* Added the following new datasets: +- Added the following new datasets: | Type | Description | Location | -|------------------------|-------------------------------------------------------------|-------------------------| +| ---------------------- | ----------------------------------------------------------- | ----------------------- | | `matlab.MatlabDataset` | A dataset which uses `scipy` to save and load `.mat` files. | `kedro_datasets.matlab` | -* Extended preview feature for matplotlib, plotly and tracking datasets. -* Allowed additional parameters for sqlalchemy engine when using sql datasets. +- Extended preview feature for matplotlib, plotly and tracking datasets. +- Allowed additional parameters for sqlalchemy engine when using sql datasets. ## Bug fixes and other changes -* Removed Windows specific conditions in `pandas.HDFDataset` extra dependencies + +- Removed Windows specific conditions in `pandas.HDFDataset` extra dependencies ## Community contributions + Many thanks to the following Kedroids for contributing PRs to this release: -* [Samuel Lee SJ](https://github.com/samuel-lee-sj) -* [Felipe Monroy](https://github.com/felipemonroy) -* [Manuel Spierenburg](https://github.com/mjspier) + +- [Samuel Lee SJ](https://github.com/samuel-lee-sj) +- [Felipe Monroy](https://github.com/felipemonroy) +- [Manuel Spierenburg](https://github.com/mjspier) # Release 2.0.0 + ## Major features and improvements -* Added the following new datasets: +- Added the following new datasets: | Type | Description | Location | -|--------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------|------------------------------| +| ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------- | ---------------------------- | | `huggingface.HFDataset` | A dataset to load Hugging Face datasets using the [datasets](https://pypi.org/project/datasets) library. | `kedro_datasets.huggingface` | | `huggingface.HFTransformerPipelineDataset` | A dataset to load pretrained Hugging Face transformers using the [transformers](https://pypi.org/project/transformers) library. | `kedro_datasets.huggingface` | -* Removed Dataset classes ending with "DataSet", use the "Dataset" spelling instead. -* Removed support for Python 3.7 and 3.8. -* Added [databricks-connect>=13.0](https://docs.databricks.com/en/dev-tools/databricks-connect-ref.html) support for Spark- and Databricks-based datasets. -* Bumped `s3fs` to latest calendar-versioned release. -* `PartitionedDataset` and `IncrementalDataset` now both support versioning of the underlying dataset. +- Removed Dataset classes ending with "DataSet", use the "Dataset" spelling instead. +- Removed support for Python 3.7 and 3.8. 
+- Added [databricks-connect>=13.0](https://docs.databricks.com/en/dev-tools/databricks-connect-ref.html) support for Spark- and Databricks-based datasets. +- Bumped `s3fs` to latest calendar-versioned release. +- `PartitionedDataset` and `IncrementalDataset` now both support versioning of the underlying dataset. ## Bug fixes and other changes -* Fixed bug with loading models saved with `TensorFlowModelDataset`. -* Made dataset parameters keyword-only. -* Corrected pandas-gbq as py311 dependency. + +- Fixed bug with loading models saved with `TensorFlowModelDataset`. +- Made dataset parameters keyword-only. +- Corrected pandas-gbq as py311 dependency. ## Community contributions + Many thanks to the following Kedroids for contributing PRs to this release: -* [Edouard59](https://github.com/Edouard59) -* [Miguel Rodriguez Gutierrez](https://github.com/MigQ2) -* [felixscherz](https://github.com/felixscherz) -* [Onur Kuru](https://github.com/kuruonur1) + +- [Edouard59](https://github.com/Edouard59) +- [Miguel Rodriguez Gutierrez](https://github.com/MigQ2) +- [felixscherz](https://github.com/felixscherz) +- [Onur Kuru](https://github.com/kuruonur1) # Release 1.8.0 + ## Major features and improvements -* Added the following new datasets: +- Added the following new datasets: -| Type | Description | Location | -|------------------------------|------------------------------------------------------------------------|-------------------------| -| `polars.LazyPolarsDataset` | A `LazyPolarsDataset` using [polars](https://www.pola.rs/)'s Lazy API. | `kedro_datasets.polars` | +| Type | Description | Location | +| -------------------------- | ---------------------------------------------------------------------- | ----------------------- | +| `polars.LazyPolarsDataset` | A `LazyPolarsDataset` using [polars](https://www.pola.rs/)'s Lazy API. | `kedro_datasets.polars` | -* Moved `PartitionedDataSet` and `IncrementalDataSet` from the core Kedro repo to `kedro-datasets` and renamed to `PartitionedDataset` and `IncrementalDataset`. -* Renamed `polars.GenericDataSet` to `polars.EagerPolarsDataset` to better reflect the difference between the two dataset classes. -* Added a deprecation warning when using `polars.GenericDataSet` or `polars.GenericDataset` that these have been renamed to `polars.EagerPolarsDataset` -* Delayed backend connection for `pandas.SQLTableDataset`, `pandas.SQLQueryDataset`, and `snowflake.SnowparkTableDataset`. In practice, this means that a dataset's connection details aren't used (or validated) until the dataset is accessed. On the plus side, the cost of connection isn't incurred regardless of when or whether the dataset is used. +- Moved `PartitionedDataSet` and `IncrementalDataSet` from the core Kedro repo to `kedro-datasets` and renamed to `PartitionedDataset` and `IncrementalDataset`. +- Renamed `polars.GenericDataSet` to `polars.EagerPolarsDataset` to better reflect the difference between the two dataset classes. +- Added a deprecation warning when using `polars.GenericDataSet` or `polars.GenericDataset` that these have been renamed to `polars.EagerPolarsDataset` +- Delayed backend connection for `pandas.SQLTableDataset`, `pandas.SQLQueryDataset`, and `snowflake.SnowparkTableDataset`. In practice, this means that a dataset's connection details aren't used (or validated) until the dataset is accessed. On the plus side, the cost of connection isn't incurred regardless of when or whether the dataset is used. 
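As an illustration of the delayed-connection behaviour described in the last bullet above, here is a minimal sketch (not part of this patch), assuming the `pandas.SQLQueryDataset` keyword arguments; the SQLite URL and query are placeholders:

```python
from kedro_datasets.pandas import SQLQueryDataset

# Constructing the dataset stores the connection details but does not
# open (or validate) a database connection.
ds = SQLQueryDataset(
    sql="SELECT 1 AS dummy",
    credentials={"con": "sqlite:///example.db"},
)

# The connection is only established when the dataset is actually accessed.
df = ds.load()
print(df)
```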
## Bug fixes and other changes -* Fixed erroneous warning when using an cloud protocol file path with SparkDataSet on Databricks. -* Updated `PickleDataset` to explicitly mention `cloudpickle` support. + +- Fixed erroneous warning when using an cloud protocol file path with SparkDataSet on Databricks. +- Updated `PickleDataset` to explicitly mention `cloudpickle` support. ## Community contributions + Many thanks to the following Kedroids for contributing PRs to this release: -* [PtrBld](https://github.com/PtrBld) -* [Alistair McKelvie](https://github.com/alamastor) -* [Felix Wittmann](https://github.com/hfwittmann) -* [Matthias Roels](https://github.com/MatthiasRoels) + +- [PtrBld](https://github.com/PtrBld) +- [Alistair McKelvie](https://github.com/alamastor) +- [Felix Wittmann](https://github.com/hfwittmann) +- [Matthias Roels](https://github.com/MatthiasRoels) # Release 1.7.1 + ## Bug fixes and other changes -* Pinned `tables` version on `kedro-datasets` for Python < 3.8. + +- Pinned `tables` version on `kedro-datasets` for Python < 3.8. ## Upcoming deprecations for Kedro-Datasets 2.0.0 -* Renamed dataset and error classes, in accordance with the [Kedro lexicon](https://github.com/kedro-org/kedro/wiki/Kedro-documentation-style-guide#kedro-lexicon). Dataset classes ending with "DataSet" are deprecated and will be removed in 2.0.0. + +- Renamed dataset and error classes, in accordance with the [Kedro lexicon](https://github.com/kedro-org/kedro/wiki/Kedro-documentation-style-guide#kedro-lexicon). Dataset classes ending with "DataSet" are deprecated and will be removed in 2.0.0. # Release 1.7.0: + ## Major features and improvements -* Added the following new datasets: +- Added the following new datasets: -| Type | Description | Location | -|---------------------------|----------------------------------------------------------------------------------------------------------------------------|-------------------------| -| `polars.GenericDataSet` | A `GenericDataSet` backed by [polars](https://www.pola.rs/), a lightning fast dataframe package built entirely using Rust. | `kedro_datasets.polars` | +| Type | Description | Location | +| ----------------------- | -------------------------------------------------------------------------------------------------------------------------- | ----------------------- | +| `polars.GenericDataSet` | A `GenericDataSet` backed by [polars](https://www.pola.rs/), a lightning fast dataframe package built entirely using Rust. | `kedro_datasets.polars` | ## Bug fixes and other changes -* Fixed broken links in docstrings. -* Reverted PySpark pin to <4.0. + +- Fixed broken links in docstrings. +- Reverted PySpark pin to <4.0. ## Community contributions + Many thanks to the following Kedroids for contributing PRs to this release: -* [Walber Moreira](https://github.com/wmoreiraa) + +- [Walber Moreira](https://github.com/wmoreiraa) # Release 1.6.0: ## Major features and improvements -* Added support for Python 3.11. + +- Added support for Python 3.11. # Release 1.5.3: + ## Bug fixes and other changes -* Made `databricks.ManagedTableDataSet` read-only by default. - * The user needs to specify `write_mode` to allow `save` on the data set. -* Fixed an issue on `api.APIDataSet` where the sent data was doubly converted to json + +- Made `databricks.ManagedTableDataSet` read-only by default. + - The user needs to specify `write_mode` to allow `save` on the data set. 
+- Fixed an issue on `api.APIDataSet` where the sent data was doubly converted to json string (once by us and once by the `requests` library). -* Fixed problematic `kedro-datasets` optional dependencies, revert to `setup.py` +- Fixed problematic `kedro-datasets` optional dependencies, revert to `setup.py` ## Community contributions + # Release 1.5.2: ## Bug fixes and other changes -* Fixed problematic `kedro-datasets` optional dependencies. + +- Fixed problematic `kedro-datasets` optional dependencies. # Release 1.5.1: ## Bug fixes and other changes -* Fixed problematic docstrings in `pandas.DeltaTableDataSet` causing Read the Docs builds on Kedro to fail. + +- Fixed problematic docstrings in `pandas.DeltaTableDataSet` causing Read the Docs builds on Kedro to fail. # Release 1.5.0 ## Major features and improvements -* Added the following new datasets: +- Added the following new datasets: | Type | Description | Location | -| -------------------------- |--------------------------------------|-------------------------| +| -------------------------- | ------------------------------------ | ----------------------- | | `pandas.DeltaTableDataSet` | A dataset to work with delta tables. | `kedro_datasets.pandas` | -* Implemented lazy loading of dataset subpackages and classes. - * Suppose that SQLAlchemy, a Python SQL toolkit, is installed in your Python environment. With this change, the SQLAlchemy library will not be loaded (for `pandas.SQLQueryDataSet` or `pandas.SQLTableDataSet`) if you load a different pandas dataset (e.g. `pandas.CSVDataSet`). -* Added automatic inference of file format for `pillow.ImageDataSet` to be passed to `save()`. +- Implemented lazy loading of dataset subpackages and classes. + - Suppose that SQLAlchemy, a Python SQL toolkit, is installed in your Python environment. With this change, the SQLAlchemy library will not be loaded (for `pandas.SQLQueryDataSet` or `pandas.SQLTableDataSet`) if you load a different pandas dataset (e.g. `pandas.CSVDataSet`). +- Added automatic inference of file format for `pillow.ImageDataSet` to be passed to `save()`. ## Bug fixes and other changes -* Improved error messages for missing dataset dependencies. - * Suppose that SQLAlchemy, a Python SQL toolkit, is not installed in your Python environment. Previously, `from kedro_datasets.pandas import SQLQueryDataSet` or `from kedro_datasets.pandas import SQLTableDataSet` would result in `ImportError: cannot import name 'SQLTableDataSet' from 'kedro_datasets.pandas'`. Now, the same imports raise the more helpful and intuitive `ModuleNotFoundError: No module named 'sqlalchemy'`. + +- Improved error messages for missing dataset dependencies. + - Suppose that SQLAlchemy, a Python SQL toolkit, is not installed in your Python environment. Previously, `from kedro_datasets.pandas import SQLQueryDataSet` or `from kedro_datasets.pandas import SQLTableDataSet` would result in `ImportError: cannot import name 'SQLTableDataSet' from 'kedro_datasets.pandas'`. Now, the same imports raise the more helpful and intuitive `ModuleNotFoundError: No module named 'sqlalchemy'`. 
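As an illustration of the lazy-loading behaviour described above, a minimal sketch (not part of this patch), using the class spellings of this release; the check assumes nothing else in the environment has already imported SQLAlchemy:

```python
import sys

# Importing a non-SQL pandas dataset should not pull in SQLAlchemy.
from kedro_datasets.pandas import CSVDataSet  # noqa: F401

# SQLAlchemy is only imported once a SQL dataset class is actually requested.
assert "sqlalchemy" not in sys.modules
```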
## Community contributions + Many thanks to the following Kedroids for contributing PRs to this release: -* [Daniel-Falk](https://github.com/daniel-falk) -* [afaqueahmad7117](https://github.com/afaqueahmad7117) -* [everdark](https://github.com/everdark) +- [Daniel-Falk](https://github.com/daniel-falk) +- [afaqueahmad7117](https://github.com/afaqueahmad7117) +- [everdark](https://github.com/everdark) # Release 1.4.2 + ## Bug fixes and other changes -* Fixed documentations of `GeoJSONDataSet` and `SparkStreamingDataSet`. -* Fixed problematic docstrings causing Read the Docs builds on Kedro to fail. + +- Fixed documentations of `GeoJSONDataSet` and `SparkStreamingDataSet`. +- Fixed problematic docstrings causing Read the Docs builds on Kedro to fail. # Release 1.4.1: ## Bug fixes and other changes -* Fixed missing `pickle.PickleDataSet` extras in `setup.py`. + +- Fixed missing `pickle.PickleDataSet` extras in `setup.py`. # Release 1.4.0: ## Major features and improvements -* Added the following new datasets: +- Added the following new datasets: | Type | Description | Location | -|-------------------------------|-----------------------------------------------------|------------------------| +| ----------------------------- | --------------------------------------------------- | ---------------------- | | `spark.SparkStreamingDataSet` | A dataset to work with PySpark Streaming DataFrame. | `kedro_datasets.spark` | ## Bug fixes and other changes -* Fixed problematic docstrings of `APIDataSet`. + +- Fixed problematic docstrings of `APIDataSet`. # Release 1.3.0: ## Major features and improvements -* Added the following new datasets: +- Added the following new datasets: | Type | Description | Location | -|----------------------------------|---------------------------------------------------------|-----------------------------| +| -------------------------------- | ------------------------------------------------------- | --------------------------- | | `databricks.ManagedTableDataSet` | A dataset to access managed delta tables in Databricks. | `kedro_datasets.databricks` | -* Added pandas 2.0 support. -* Added SQLAlchemy 2.0 support (and dropped support for versions below 1.4). -* Added a save method to `APIDataSet`. -* Reduced constructor arguments for `APIDataSet` by replacing most arguments with a single constructor argument `load_args`. This makes it more consistent with other Kedro DataSets and the underlying `requests` API, and automatically enables the full configuration domain: stream, certificates, proxies, and more. -* Relaxed Kedro version pin to `>=0.16`. -* Added `metadata` attribute to all existing datasets. This is ignored by Kedro, but may be consumed by users or external plugins. +- Added pandas 2.0 support. +- Added SQLAlchemy 2.0 support (and dropped support for versions below 1.4). +- Added a save method to `APIDataSet`. +- Reduced constructor arguments for `APIDataSet` by replacing most arguments with a single constructor argument `load_args`. This makes it more consistent with other Kedro DataSets and the underlying `requests` API, and automatically enables the full configuration domain: stream, certificates, proxies, and more. +- Relaxed Kedro version pin to `>=0.16`. +- Added `metadata` attribute to all existing datasets. This is ignored by Kedro, but may be consumed by users or external plugins. ## Bug fixes and other changes -* Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x. -* Upgraded required `polars` version to 0.17. 
-* Renamed `TensorFlowModelDataset` to `TensorFlowModelDataSet` to be consistent with all other plugins in Kedro-Datasets. + +- Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x. +- Upgraded required `polars` version to 0.17. +- Renamed `TensorFlowModelDataset` to `TensorFlowModelDataSet` to be consistent with all other plugins in Kedro-Datasets. ## Community contributions + Many thanks to the following Kedroids for contributing PRs to this release: -* [BrianCechmanek](https://github.com/BrianCechmanek) -* [McDonnellJoseph](https://github.com/McDonnellJoseph) -* [Danny Farah](https://github.com/dannyrfar) +- [BrianCechmanek](https://github.com/BrianCechmanek) +- [McDonnellJoseph](https://github.com/McDonnellJoseph) +- [Danny Farah](https://github.com/dannyrfar) # Release 1.2.0: ## Major features and improvements -* Added `fsspec` resolution in `SparkDataSet` to support more filesystems. -* Added the `_preview` method to the Pandas `ExcelDataSet` and `CSVDataSet` classes. + +- Added `fsspec` resolution in `SparkDataSet` to support more filesystems. +- Added the `_preview` method to the Pandas `ExcelDataSet` and `CSVDataSet` classes. ## Bug fixes and other changes -* Fixed a docstring in the Pandas `SQLQueryDataSet` as part of the Sphinx revamp on Kedro. + +- Fixed a docstring in the Pandas `SQLQueryDataSet` as part of the Sphinx revamp on Kedro. # Release 1.1.1: ## Bug fixes and other changes -* Fixed problematic docstrings causing Read the Docs builds on Kedro to fail. +- Fixed problematic docstrings causing Read the Docs builds on Kedro to fail. # Release 1.1.0: ## Major features and improvements -* Added the following new datasets: +- Added the following new datasets: | Type | Description | Location | | -------------------------------- | --------------------------------------------------------------------------------------------------------------------- | -------------------------- | @@ -358,21 +416,23 @@ Many thanks to the following Kedroids for contributing PRs to this release: | `snowflake.SnowparkTableDataSet` | Work with [Snowpark](https://www.snowflake.com/en/data-cloud/snowpark/) DataFrames from tables in Snowflake. | `kedro_datasets.snowflake` | ## Bug fixes and other changes -* Add `mssql` backend to the `SQLQueryDataSet` DataSet using `pyodbc` library. -* Added a warning when the user tries to use `SparkDataSet` on Databricks without specifying a file path with the `/dbfs/` prefix. + +- Add `mssql` backend to the `SQLQueryDataSet` DataSet using `pyodbc` library. +- Added a warning when the user tries to use `SparkDataSet` on Databricks without specifying a file path with the `/dbfs/` prefix. # Release 1.0.2: ## Bug fixes and other changes -* Change reference to `kedro.pipeline.Pipeline` object throughout test suite with `kedro.modular_pipeline.pipeline` factory. -* Relaxed PyArrow range in line with pandas. -* Fixed outdated links to the dill package documentation. + +- Change reference to `kedro.pipeline.Pipeline` object throughout test suite with `kedro.modular_pipeline.pipeline` factory. +- Relaxed PyArrow range in line with pandas. +- Fixed outdated links to the dill package documentation. # Release 1.0.1: ## Bug fixes and other changes -* Fixed docstring formatting in `VideoDataSet` that was causing the documentation builds to fail. +- Fixed docstring formatting in `VideoDataSet` that was causing the documentation builds to fail. 
# Release 1.0.0: @@ -383,7 +443,8 @@ Datasets are Kedro’s way of dealing with input and output in a data and machin The datasets have always been part of the core Kedro Framework project inside `kedro.extras`. In Kedro `0.19.0`, we will remove datasets from Kedro to reduce breaking changes associated with dataset dependencies. Instead, users will need to use the datasets from the `kedro-datasets` repository instead. ## Major features and improvements -* Changed `pandas.ParquetDataSet` to load data using pandas instead of parquet. + +- Changed `pandas.ParquetDataSet` to load data using pandas instead of parquet. # Release 0.1.0: @@ -391,5 +452,4 @@ The initial release of Kedro-Datasets. ## Thanks to our main contributors - We are also grateful to everyone who advised and supported us, filed issues or helped resolve them, asked and answered questions and were part of inspiring discussions. diff --git a/kedro-datasets/docs/source/api/kedro_datasets.rst b/kedro-datasets/docs/source/api/kedro_datasets.rst index 45b275de5..3da276e87 100644 --- a/kedro-datasets/docs/source/api/kedro_datasets.rst +++ b/kedro-datasets/docs/source/api/kedro_datasets.rst @@ -21,6 +21,7 @@ kedro_datasets holoviews.HoloviewsWriter huggingface.HFDataset huggingface.HFTransformerPipelineDataset + ibis.FileDataset ibis.TableDataset json.JSONDataset matlab.MatlabDataset diff --git a/kedro-datasets/kedro_datasets/ibis/__init__.py b/kedro-datasets/kedro_datasets/ibis/__init__.py index 7e793c4e0..47867f657 100644 --- a/kedro-datasets/kedro_datasets/ibis/__init__.py +++ b/kedro-datasets/kedro_datasets/ibis/__init__.py @@ -4,8 +4,10 @@ import lazy_loader as lazy # https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 +FileDataset: Any TableDataset: Any __getattr__, __dir__, __all__ = lazy.attach( - __name__, submod_attrs={"table_dataset": ["TableDataset"]} + __name__, + submod_attrs={"file_dataset": ["FileDataset"], "table_dataset": ["TableDataset"]}, ) diff --git a/kedro-datasets/kedro_datasets/ibis/file_dataset.py b/kedro-datasets/kedro_datasets/ibis/file_dataset.py new file mode 100644 index 000000000..11b58bc32 --- /dev/null +++ b/kedro-datasets/kedro_datasets/ibis/file_dataset.py @@ -0,0 +1,195 @@ +"""Provide file loading and saving functionality for Ibis's backends.""" +from __future__ import annotations + +from copy import deepcopy +from pathlib import Path, PurePosixPath +from typing import TYPE_CHECKING, Any, ClassVar + +import ibis.expr.types as ir +from kedro.io import AbstractVersionedDataset, DatasetError, Version + +if TYPE_CHECKING: + from ibis import BaseBackend + + +class FileDataset(AbstractVersionedDataset[ir.Table, ir.Table]): + """``FileDataset`` loads/saves data from/to a specified file format. + + Example usage for the + `YAML API `_: + + .. code-block:: yaml + + cars: + type: ibis.FileDataset + filepath: data/01_raw/company/cars.csv + file_format: csv + table_name: cars + connection: + backend: duckdb + database: company.db + load_args: + sep: "," + nullstr: "#NA" + save_args: + sep: "," + nullstr: "#NA" + + motorbikes: + type: ibis.FileDataset + filepath: s3://your_bucket/data/02_intermediate/company/motorbikes/ + file_format: delta + table_name: motorbikes + connection: + backend: polars + + Example usage for the + `Python API `_: + + .. code-block:: pycon + + >>> import ibis + >>> from kedro_datasets.ibis import FileDataset + >>> + >>> data = ibis.memtable({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) + >>> + >>> dataset = FileDataset( + ... filepath=tmp_path / "test.csv", + ... 
file_format="csv", + ... table_name="test", + ... connection={"backend": "duckdb", "database": tmp_path / "file.db"}, + ... ) + >>> dataset.save(data) + >>> reloaded = dataset.load() + >>> assert data.execute().equals(reloaded.execute()) + + """ + + DEFAULT_LOAD_ARGS: ClassVar[dict[str, Any]] = {} + DEFAULT_SAVE_ARGS: ClassVar[dict[str, Any]] = {} + + _connections: ClassVar[dict[tuple[tuple[str, str]], BaseBackend]] = {} + + def __init__( # noqa: PLR0913 + self, + filepath: str, + file_format: str = "parquet", + *, + table_name: str | None = None, + connection: dict[str, Any] | None = None, + load_args: dict[str, Any] | None = None, + save_args: dict[str, Any] | None = None, + version: Version | None = None, + metadata: dict[str, Any] | None = None, + ) -> None: + """Creates a new ``FileDataset`` pointing to the given filepath. + + ``FileDataset`` connects to the Ibis backend object constructed + from the connection configuration. The `backend` key provided in + the config can be any of the `supported backends `_. The remaining dictionary entries will be + passed as arguments to the underlying ``connect()`` method (e.g. + `ibis.duckdb.connect() `_). + + The read method corresponding to the given ``file_format`` (e.g. + `read_csv() `_) is used to load + the file with the backend. Note that only the data is loaded; no + link to the underlying file exists past ``FileDataset.load()``. + + Args: + filepath: Path to a file to register as a table. Most useful + for loading data into your data warehouse (for testing). + On save, the backend exports data to the specified path. + file_format: String specifying the file format for the file. + Defaults to writing execution results to a Parquet file. + table_name: The name to use for the created table (on load). + connection: Configuration for connecting to an Ibis backend. + load_args: Additional arguments passed to the Ibis backend's + `read_{file_format}` method. + save_args: Additional arguments passed to the Ibis backend's + `to_{file_format}` method. + version: If specified, should be an instance of + ``kedro.io.core.Version``. If its ``load`` attribute is + None, the latest version will be loaded. If its ``save`` + attribute is None, save version will be autogenerated. + metadata: Any arbitrary metadata. This is ignored by Kedro, + but may be consumed by users or external plugins. + """ + self._file_format = file_format + self._table_name = table_name + self._connection_config = connection + self.metadata = metadata + + super().__init__( + filepath=PurePosixPath(filepath), + version=version, + exists_function=lambda filepath: Path(filepath).exists(), + ) + + # Set load and save arguments, overwriting defaults if provided. 
+ self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) + if load_args is not None: + self._load_args.update(load_args) + + self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) + if save_args is not None: + self._save_args.update(save_args) + + @property + def connection(self) -> BaseBackend: + """The ``Backend`` instance for the connection configuration.""" + + def hashable(value): + """Return a hashable key for a potentially-nested object.""" + if isinstance(value, dict): + return tuple((k, hashable(v)) for k, v in sorted(value.items())) + if isinstance(value, list): + return tuple(hashable(x) for x in value) + return value + + cls = type(self) + key = hashable(self._connection_config) + if key not in cls._connections: + import ibis + + config = deepcopy(self._connection_config) + backend_attr = config.pop("backend") if config else None + backend = getattr(ibis, backend_attr) + cls._connections[key] = backend.connect(**config) + + return cls._connections[key] + + def load(self) -> ir.Table: + load_path = self._get_load_path() + reader = getattr(self.connection, f"read_{self._file_format}") + return reader(load_path, self._table_name, **self._load_args) + + def save(self, data: ir.Table) -> None: + save_path = self._get_save_path() + Path(save_path).parent.mkdir(parents=True, exist_ok=True) + writer = getattr(self.connection, f"to_{self._file_format}") + writer(data, save_path, **self._save_args) + + def _describe(self) -> dict[str, Any]: + return { + "filepath": self._filepath, + "file_format": self._file_format, + "table_name": self._table_name, + "backend": self._connection_config.get("backend") + if self._connection_config + else None, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } + + def _exists(self) -> bool: + try: + load_path = self._get_load_path() + except DatasetError: + return False + + return Path(load_path).exists() diff --git a/kedro-datasets/kedro_datasets/ibis/table_dataset.py b/kedro-datasets/kedro_datasets/ibis/table_dataset.py index cf92736cd..7550e6266 100644 --- a/kedro-datasets/kedro_datasets/ibis/table_dataset.py +++ b/kedro-datasets/kedro_datasets/ibis/table_dataset.py @@ -1,12 +1,15 @@ """Provide data loading and saving functionality for Ibis's backends.""" from __future__ import annotations +import warnings from copy import deepcopy from typing import TYPE_CHECKING, Any, ClassVar import ibis.expr.types as ir from kedro.io import AbstractDataset, DatasetError +from kedro_datasets import KedroDeprecationWarning + if TYPE_CHECKING: from ibis import BaseBackend @@ -21,15 +24,10 @@ class TableDataset(AbstractDataset[ir.Table, ir.Table]): cars: type: ibis.TableDataset - filepath: data/01_raw/company/cars.csv - file_format: csv table_name: cars connection: backend: duckdb database: company.db - load_args: - sep: "," - nullstr: "#NA" save_args: materialized: table @@ -91,12 +89,6 @@ def __init__( # noqa: PLR0913 `ibis.duckdb.connect() `_). - If ``filepath`` and ``file_format`` are given, the corresponding - read method (e.g. `read_csv() `_) is used to load - the file with the backend. Note that only the data is loaded; no - link to the underlying file exists past ``TableDataset.load()``. - If ``table_name`` is given (and ``filepath`` isn't), the dataset establishes a connection to the relevant table for the execution backend. Therefore, Ibis doesn't fetch data on load; all compute @@ -105,9 +97,6 @@ def __init__( # noqa: PLR0913 is saved, after running code defined across one more more nodes. 
Args: - filepath: Path to a file to register as a table. Most useful - for loading data into your data warehouse (for testing). - file_format: Specifies the input file format for `filepath`. table_name: The name of the table or view to read or create. connection: Configuration for connecting to an Ibis backend. load_args: Additional arguments passed to the Ibis backend's @@ -125,6 +114,15 @@ def __init__( # noqa: PLR0913 "Must provide at least one of `filepath` or `table_name`." ) + if filepath is not None or file_format is not None: + warnings.warn( + "Use 'FileDataset' to load and save files with an Ibis " + "backend; the functionality will be removed from 'Table" + "Dataset' in Kedro-Datasets 6.0.0", + KedroDeprecationWarning, + stacklevel=2, + ) + self._filepath = filepath self._file_format = file_format self._table_name = table_name @@ -144,7 +142,10 @@ def __init__( # noqa: PLR0913 @property def connection(self) -> BaseBackend: + """The ``Backend`` instance for the connection configuration.""" + def hashable(value): + """Return a hashable key for a potentially-nested object.""" if isinstance(value, dict): return tuple((k, hashable(v)) for k, v in sorted(value.items())) if isinstance(value, list): diff --git a/kedro-datasets/tests/ibis/test_file_dataset.py b/kedro-datasets/tests/ibis/test_file_dataset.py new file mode 100644 index 000000000..e598bffff --- /dev/null +++ b/kedro-datasets/tests/ibis/test_file_dataset.py @@ -0,0 +1,274 @@ +import sys +from pathlib import Path +from time import sleep + +import ibis +import pytest +from kedro.io import DatasetError, Version +from kedro.io.core import generate_timestamp +from pandas.testing import assert_frame_equal + +from kedro_datasets.ibis import FileDataset + + +@pytest.fixture +def filepath_csv(tmp_path): + return (tmp_path / "test.csv").as_posix() + + +@pytest.fixture +def database(tmp_path): + return (tmp_path / "file.db").as_posix() + + +@pytest.fixture(params=[None]) +def connection_config(request, database): + return request.param or {"backend": "duckdb", "database": database} + + +@pytest.fixture +def file_dataset(filepath_csv, connection_config, load_args, save_args): + return FileDataset( + filepath=filepath_csv, + file_format="csv", + connection=connection_config, + load_args=load_args, + save_args=save_args, + ) + + +@pytest.fixture +def versioned_file_dataset(filepath_csv, connection_config, load_version, save_version): + return FileDataset( + filepath=filepath_csv, + file_format="csv", + connection=connection_config, + version=Version(load_version, save_version), + ) + + +@pytest.fixture +def dummy_table(): + return ibis.memtable({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) + + +class TestFileDataset: + def test_save_and_load(self, file_dataset, dummy_table, database): + """Test saving and reloading the data set.""" + file_dataset.save(dummy_table) + reloaded = file_dataset.load() + assert_frame_equal(dummy_table.execute(), reloaded.execute()) + + @pytest.mark.parametrize("load_args", [{"filename": True}], indirect=True) + def test_load_extra_params(self, file_dataset, load_args, dummy_table): + """Test overriding the default load arguments.""" + file_dataset.save(dummy_table) + assert "filename" in file_dataset.load() + + @pytest.mark.parametrize("save_args", [{"sep": "|"}], indirect=True) + def test_save_extra_params( + self, file_dataset, save_args, dummy_table, filepath_csv + ): + """Test overriding the default save arguments.""" + file_dataset.save(dummy_table) + + # Verify that the delimiter character from 
`save_args` was used. + with open(filepath_csv) as f: + for line in f: + assert save_args["sep"] in line + + @pytest.mark.parametrize( + ("connection_config", "key"), + [ + ( + {"backend": "duckdb", "database": "file.db", "extensions": ["spatial"]}, + ( + ("backend", "duckdb"), + ("database", "file.db"), + ("extensions", ("spatial",)), + ), + ), + # https://github.com/kedro-org/kedro-plugins/pull/560#discussion_r1536083525 + ( + { + "host": "xxx.sql.azuresynapse.net", + "database": "xxx", + "query": {"driver": "ODBC Driver 17 for SQL Server"}, + "backend": "mssql", + }, + ( + ("backend", "mssql"), + ("database", "xxx"), + ("host", "xxx.sql.azuresynapse.net"), + ("query", (("driver", "ODBC Driver 17 for SQL Server"),)), + ), + ), + ], + indirect=["connection_config"], + ) + def test_connection_config(self, mocker, file_dataset, connection_config, key): + """Test hashing of more complicated connection configuration.""" + mocker.patch(f"ibis.{connection_config['backend']}") + file_dataset.load() + assert key in file_dataset._connections + + +class TestFileDatasetVersioned: + def test_version_str_repr(self, connection_config, load_version, save_version): + """Test that version is in string representation of the class instance + when applicable.""" + filepath = "test.csv" + ds = FileDataset( + filepath=filepath, file_format="csv", connection=connection_config + ) + ds_versioned = FileDataset( + filepath=filepath, + file_format="csv", + connection=connection_config, + version=Version(load_version, save_version), + ) + assert filepath in str(ds) + assert "version" not in str(ds) + + assert filepath in str(ds_versioned) + ver_str = f"version=Version(load={load_version}, save='{save_version}')" + assert ver_str in str(ds_versioned) + assert "FileDataset" in str(ds_versioned) + assert "FileDataset" in str(ds) + # Default save_args + assert "save_args={}" in str(ds) + assert "save_args={}" in str(ds_versioned) + + def test_save_and_load(self, versioned_file_dataset, dummy_table): + """Test that saved and reloaded data matches the original one for + the versioned data set.""" + versioned_file_dataset.save(dummy_table) + reloaded = versioned_file_dataset.load() + assert_frame_equal(dummy_table.execute(), reloaded.execute()) + + def test_multiple_loads( + self, versioned_file_dataset, dummy_table, filepath_csv, connection_config + ): + """Test that if a new version is created mid-run, by an + external system, it won't be loaded in the current run.""" + versioned_file_dataset.save(dummy_table) + versioned_file_dataset.load() + v1 = versioned_file_dataset.resolve_load_version() + + sleep(0.5) + # force-drop a newer version into the same location + v_new = generate_timestamp() + FileDataset( + filepath=filepath_csv, + file_format="csv", + connection=connection_config, + version=Version(v_new, v_new), + ).save(dummy_table) + + versioned_file_dataset.load() + v2 = versioned_file_dataset.resolve_load_version() + + assert v2 == v1 # v2 should not be v_new! 
+ ds_new = FileDataset( + filepath=filepath_csv, + file_format="csv", + connection=connection_config, + version=Version(None, None), + ) + assert ( + ds_new.resolve_load_version() == v_new + ) # new version is discoverable by a new instance + + def test_multiple_saves(self, dummy_table, filepath_csv, connection_config): + """Test multiple cycles of save followed by load for the same dataset""" + ds_versioned = FileDataset( + filepath=filepath_csv, + file_format="csv", + connection=connection_config, + version=Version(None, None), + ) + + # first save + ds_versioned.save(dummy_table) + first_save_version = ds_versioned.resolve_save_version() + first_load_version = ds_versioned.resolve_load_version() + assert first_load_version == first_save_version + + # second save + sleep(0.5) + ds_versioned.save(dummy_table) + second_save_version = ds_versioned.resolve_save_version() + second_load_version = ds_versioned.resolve_load_version() + assert second_load_version == second_save_version + assert second_load_version > first_load_version + + # another dataset + ds_new = FileDataset( + filepath=filepath_csv, + file_format="csv", + connection=connection_config, + version=Version(None, None), + ) + assert ds_new.resolve_load_version() == second_load_version + + def test_no_versions(self, versioned_file_dataset): + """Check the error if no versions are available for load.""" + pattern = r"Did not find any versions for FileDataset\(.+\)" + with pytest.raises(DatasetError, match=pattern): + versioned_file_dataset.load() + + def test_exists(self, versioned_file_dataset, dummy_table): + """Test `exists` method invocation for versioned data set.""" + assert not versioned_file_dataset.exists() + versioned_file_dataset.save(dummy_table) + assert versioned_file_dataset.exists() + + def test_prevent_overwrite(self, versioned_file_dataset, dummy_table): + """Check the error when attempting to override the data set if the + corresponding CSV file for a given save version already exists.""" + versioned_file_dataset.save(dummy_table) + pattern = ( + r"Save path \'.+\' for FileDataset\(.+\) must " + r"not exist if versioning is enabled\." 
+ ) + with pytest.raises(DatasetError, match=pattern): + versioned_file_dataset.save(dummy_table) + + @pytest.mark.parametrize( + "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True + ) + @pytest.mark.parametrize( + "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True + ) + def test_save_version_warning( + self, versioned_file_dataset, load_version, save_version, dummy_table + ): + """Check the warning when saving to the path that differs from + the subsequent load path.""" + pattern = ( + rf"Save version '{save_version}' did not match load version " + rf"'{load_version}' for FileDataset\(.+\)" + ) + with pytest.warns(UserWarning, match=pattern): + versioned_file_dataset.save(dummy_table) + + @pytest.mark.skipif(sys.platform == "win32", reason="different error on windows") + def test_versioning_existing_dataset( + self, file_dataset, versioned_file_dataset, dummy_table + ): + """Check the error when attempting to save a versioned dataset on top of an + already existing (non-versioned) dataset.""" + file_dataset.save(dummy_table) + assert file_dataset.exists() + assert file_dataset._filepath == versioned_file_dataset._filepath + pattern = ( + f"(?=.*file with the same name already exists in the directory)" + f"(?=.*{versioned_file_dataset._filepath.parent.as_posix()})" + ) + with pytest.raises(DatasetError, match=pattern): + versioned_file_dataset.save(dummy_table) + + # Remove non-versioned dataset and try again + Path(file_dataset._filepath.as_posix()).unlink() + versioned_file_dataset.save(dummy_table) + assert versioned_file_dataset.exists()
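One practical consequence of this patch: catalogs or code that use `ibis.TableDataset` for file I/O should migrate to `ibis.FileDataset` before that functionality is removed in Kedro-Datasets 6.0.0. A minimal Python sketch of the migration, based on the constructor signatures in this patch (the file path and DuckDB database name are illustrative):

```python
from kedro_datasets.ibis import FileDataset, TableDataset

# Before: TableDataset doing file I/O (now emits a KedroDeprecationWarning).
cars_legacy = TableDataset(
    filepath="data/01_raw/cars.csv",
    file_format="csv",
    table_name="cars",
    connection={"backend": "duckdb", "database": "company.db"},
)

# After: FileDataset handles the file I/O...
cars_file = FileDataset(
    filepath="data/01_raw/cars.csv",
    file_format="csv",
    table_name="cars",
    connection={"backend": "duckdb", "database": "company.db"},
)

# ...and TableDataset is reserved for reading and writing backend tables.
cars_table = TableDataset(
    table_name="cars",
    connection={"backend": "duckdb", "database": "company.db"},
)
```

The equivalent catalog change is to drop `filepath`, `file_format`, and file-related `load_args` from the `ibis.TableDataset` entry and move them to a new `ibis.FileDataset` entry, as shown in the YAML examples in `file_dataset.py` above.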