
Commit

Merge branch 'main' into add-grouped-natural-id-partitioner
jafermarq authored Aug 22, 2024
2 parents 2c5500c + 2689ee5 commit 058d846
Showing 3 changed files with 77 additions and 20 deletions.
33 changes: 13 additions & 20 deletions datasets/doc/source/how-to-use-with-local-data.rst
@@ -37,14 +37,6 @@ CSV
data_files = [ "path-to-my-file-1.csv", "path-to-my-file-2.csv", ...]
dataset = load_dataset("csv", data_files=data_files)
# Divided Dataset
data_files = {
"train": single_train_file_or_list_of_files,
"test": single_test_file_or_list_of_files,
"can-have-more-splits": ...
}
dataset = load_dataset("csv", data_files=data_files)
partitioner = ChosenPartitioner(...)
partitioner.dataset = dataset
partition = partitioner.load_partition(partition_id=0)
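For reference, a minimal end-to-end version of the CSV flow above, assuming a placeholder file name and using `IidPartitioner` with `num_partitions=10` purely for illustration (any concrete partitioner is assigned and used the same way):

.. code-block:: python

    from datasets import load_dataset
    from flwr_datasets.partitioner import IidPartitioner

    # load_dataset returns a DatasetDict; with plain data_files the data ends up
    # in the "train" split
    dataset_dict = load_dataset("csv", data_files="path-to-my-file.csv")
    dataset = dataset_dict["train"]

    # Split the data IID into 10 partitions (the number is arbitrary here)
    partitioner = IidPartitioner(num_partitions=10)
    partitioner.dataset = dataset
    partition = partitioner.load_partition(partition_id=0)
    print(partition)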
@@ -60,18 +52,10 @@ JSON
# Single file
data_files = "path-to-my-file.json"
# Multitple Files
# Multiple Files
data_files = [ "path-to-my-file-1.json", "path-to-my-file-2.json", ...]
dataset = load_dataset("json", data_files=data_files)
# Divided Dataset
data_files = {
"train": single_train_file_or_list_of_files,
"test": single_test_file_or_list_of_files,
"can-have-more-splits": ...
}
dataset = load_dataset("json", data_files=data_files)
partitioner = ChosenPartitioner(...)
partitioner.dataset = dataset
partition = partitioner.load_partition(partition_id=0)
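Note that `load_dataset` returns a `DatasetDict` even for a single JSON file, so a single split has to be selected before it is handed to the partitioner. A short sketch (the file name is a placeholder):

.. code-block:: python

    from datasets import load_dataset

    dataset_dict = load_dataset("json", data_files="path-to-my-file.json")
    print(dataset_dict.keys())  # available splits, typically dict_keys(['train'])

    # Assign a single split (a datasets.Dataset), not the whole DatasetDict
    dataset = dataset_dict["train"]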
@@ -103,7 +87,12 @@ Then, the path you can give is `./mnist`.
from flwr_datasets.partitioner import ChosenPartitioner
# Directly from a directory
dataset = load_dataset("imagefolder", data_dir="/path/to/folder")
dataset_dict = load_dataset("imagefolder", data_dir="/path/to/folder")
# Note that what we just loaded is a DatasetDict; we need to choose a single split
# and assign it to partitioner.dataset,
# e.g. the "train" split, although that depends on the structure of your directory
dataset = dataset_dict["train"]
partitioner = ChosenPartitioner(...)
partitioner.dataset = dataset
partition = partitioner.load_partition(partition_id=0)
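Assuming the standard `imagefolder` schema (an `image` column holding the decoded image and a `label` column inferred from the directory names), one example from the loaded partition can be inspected as follows (an illustrative sketch, not part of the original guide):

.. code-block:: python

    sample = partition[0]
    print(sample["label"])  # class index derived from the folder name
    sample["image"].show()  # a PIL image; opens in an external viewer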
@@ -134,7 +123,11 @@ Analogously to the image datasets, there are two methods here:
from datasets import load_dataset
from flwr_datasets.partitioner import ChosenPartitioner
dataset = load_dataset("audiofolder", data_dir="/path/to/folder")
dataset_dict = load_dataset("audiofolder", data_dir="/path/to/folder")
# Note that what we just loaded is a DatasetDict; we need to choose a single split
# and assign it to partitioner.dataset,
# e.g. the "train" split, although that depends on the structure of your directory
dataset = dataset_dict["train"]
partitioner = ChosenPartitioner(...)
partitioner.dataset = dataset
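Analogously, the standard `audiofolder` schema gives each row an `audio` column (decoded array, file path, and sampling rate) plus a `label`. A small sketch of reading one example, assuming the partition is loaded as in the previous sections:

.. code-block:: python

    partition = partitioner.load_partition(partition_id=0)
    sample = partition[0]
    print(sample["audio"]["sampling_rate"], len(sample["audio"]["array"]), sample["label"])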
@@ -230,7 +223,7 @@ Partitioner abstraction is designed to allow for a single dataset assignment.

.. code-block:: python
partitioner.dataset = your_dataset
partitioner.dataset = your_dataset  # (your_dataset must be of type datasets.Dataset)
If you need to do the same partitioning on a different dataset, create a new Partitioner
for that, e.g.:
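The example that follows this sentence is collapsed in the diff view; a sketch of what reusing the same partitioning scheme on a second dataset could look like (`IidPartitioner`, the toy data, and the variable names are purely illustrative):

.. code-block:: python

    from datasets import Dataset
    from flwr_datasets.partitioner import IidPartitioner

    dataset_a = Dataset.from_dict({"feature": [0, 1, 2, 3]})
    dataset_b = Dataset.from_dict({"feature": [4, 5, 6, 7]})

    # One Partitioner instance per dataset; the scheme (IID, 2 partitions) is the same
    partitioner_a = IidPartitioner(num_partitions=2)
    partitioner_a.dataset = dataset_a

    partitioner_b = IidPartitioner(num_partitions=2)
    partitioner_b.dataset = dataset_b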
5 changes: 5 additions & 0 deletions datasets/flwr_datasets/partitioner/partitioner.py
@@ -50,6 +50,11 @@ def dataset(self, value: Dataset) -> None:
"created partitions (in case the partitioning scheme needs to create "
"the full partitioning also in order to return a single partition)."
)
if not isinstance(value, Dataset):
raise TypeError(
f"The dataset object you want to assign to the partitioner should be "
f"of type `datasets.Dataset` but given {type(value)}."
)
self._dataset = value

@abstractmethod
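With this check in place, assigning anything other than a `datasets.Dataset` (e.g. a `DatasetDict`) fails fast with a `TypeError`. A quick illustration of the behavior that the new test below exercises, using the real `IidPartitioner` instead of a dummy subclass (a sketch, assuming a flwr-datasets build that includes this change):

.. code-block:: python

    from datasets import Dataset, DatasetDict
    from flwr_datasets.partitioner import IidPartitioner

    split = Dataset.from_dict({"feature": [0, 1, 2]})
    partitioner = IidPartitioner(num_partitions=1)

    try:
        partitioner.dataset = DatasetDict({"train": split})  # wrong type
    except TypeError as err:
        print(err)

    partitioner.dataset = split  # a datasets.Dataset is accepted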
59 changes: 59 additions & 0 deletions datasets/flwr_datasets/partitioner/partitioner_test.py
@@ -0,0 +1,59 @@
# Copyright 2024 Flower Labs GmbH. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Abstract partitioner tests."""


import unittest

import datasets
from datasets import Dataset
from flwr_datasets.partitioner.partitioner import Partitioner


class DummyPartitioner(Partitioner):
"""Dummy partitioner for testing."""

def load_partition(self, partition_id: int) -> Dataset:
"""Return always a dummy dataset."""
return datasets.Dataset.from_dict({"feature": [0, 1, 2]})

@property
def num_partitions(self) -> int:
"""Return always 0."""
return 0


class TestPartitioner(unittest.TestCase):
"""Test Partitioner."""

def test_dataset_setter_incorrect_type(self) -> None:
"""Test if the incorrect type of the dataset to dataset.setter method raises."""
train_split = datasets.Dataset.from_dict({"feature": [0, 1, 2]})
test_split = datasets.Dataset.from_dict({"feature": [0, 1, 2]})
dataset = datasets.DatasetDict({"train": train_split, "test": test_split})
partitioner = DummyPartitioner()

with self.assertRaises(Exception) as context:
partitioner.dataset = dataset
self.assertIn(
"The dataset object you want to assign to the partitioner should be of "
"type `datasets.Dataset` but given "
"<class 'datasets.dataset_dict.DatasetDict'>.",
str(context.exception),
)


if __name__ == "__main__":
unittest.main()
