Merge pull request #223 from openfoodfacts/product-dataset-flavor
feat: allow fetching other datasets (obf, opff, opf)
raphael0202 authored Apr 2, 2024
2 parents 70c6f6b + 7862959 commit 36a9625
Showing 2 changed files with 43 additions and 17 deletions.
24 changes: 15 additions & 9 deletions docs/usage.md
@@ -26,11 +26,10 @@ All parameters are optional with the exception of user_agent, but here is a desc

- `username` and `password` are used to provide authentication (required for write requests)
- `country` is used to specify the country, which is used by the API to return products specific to the country or to infer which language to use by default. `world` (all products) is the default value
-- `flavor`: the Open*Facts project you want to interact with: `off` (Open Food Facts, default), `obf` (Open Beauty Facts),...
+- `flavor`: the Open*Facts project you want to interact with: `off` (Open Food Facts, default), `obf` (Open Beauty Facts), `opff` (Open Pet Food Facts), `opf` (Open Products Facts)
- `version`: API version (v2 is the default)
- `environment`: either `org` for production environment (openfoodfacts.org) or `net` for staging (openfoodfacts.net)
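
For illustration, a client configured with these options might look like the sketch below; it assumes the constructor accepts the plain string values listed above (the library may instead expect the corresponding enums from `openfoodfacts.types`):

```python
import openfoodfacts

# user_agent is the only required argument; everything else falls back to
# the defaults described above (world, off, v2, org).
api = openfoodfacts.API(
    user_agent="MyAwesomeApp/1.0",
    country="world",
    flavor="obf",  # query Open Beauty Facts instead of Open Food Facts
    version="v2",
    environment="org",
)
```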


*Get information about a product*

```python
@@ -57,25 +56,32 @@ want to update. Example:

## Using the dataset

-If you're planning to perform data analysis on Open Food Facts, the easiest way is to download and use the Open Food Facts dataset dump.
-Fortunately it can be done really easily using the SDK:
+If you're planning to perform data analysis on Open Food Facts, the easiest way is to download and use the Open Food Facts dataset dump. Fortunately it can be done really easily using the SDK:

```python
from openfoodfacts import ProductDataset

-dataset = ProductDataset("csv")
+dataset = ProductDataset(dataset_type="csv")

for product in dataset:
    print(product["product_name"])
```

-With `dataset = ProductDataset("csv")`, we automatically download (and cache) the dataset. We can then iterate over it to get information about products.
+With `dataset = ProductDataset(dataset_type="csv")`, we automatically download (and cache) the food dataset. We can then iterate over it to get information about products.

-Two dataset types are available: `csv` and `jsonl`. The `jsonl` dataset contains all the Open Food Facts database information but takes much more storage (>5 GB), while the `csv` dataset is much lighter (~700 MB) but only contains the most important fields.
-The `jsonl` dataset type is used by default.
+Two dataset types are available: `csv` and `jsonl`. The `jsonl` dataset contains all the Open Food Facts database information but takes much more storage (>5 GB), while the `csv` dataset is much lighter (~800 MB) but only contains the most important fields. The `jsonl` dataset type is used by default.

+You can also use `ProductDataset` to fetch other non-food datasets:
+
+```python
+from openfoodfacts import ProductDataset
+
+dataset = ProductDataset(dataset_type="csv")
+
+for product in dataset:
+    print(product["product_name"])
+```

## Taxonomies

For a deep dive on how to handle taxonomies, check out the [dedicated page](./handle_taxonomies.md).
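
The example added above still fetches the default Open Food Facts dump; to actually pull one of the other datasets you would presumably pass the new `flavor` argument introduced in `openfoodfacts/dataset.py` below. A hedged sketch (the import location of the `Flavor` enum is an assumption):

```python
from openfoodfacts import ProductDataset
from openfoodfacts.types import Flavor  # assumed home of the Flavor enum

# Download (and cache) the Open Beauty Facts CSV dump instead of the
# default Open Food Facts one.
dataset = ProductDataset(flavor=Flavor.obf, dataset_type="csv")

for product in dataset:
    print(product["product_name"])
```

The file that ends up being downloaded is resolved from `DATASET_FILE_NAMES[flavor][dataset_type]`, as shown in the diff below.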
36 changes: 28 additions & 8 deletions openfoodfacts/dataset.py
@@ -17,12 +17,27 @@

DEFAULT_CACHE_DIR = Path("~/.cache/openfoodfacts/datasets").expanduser()
DATASET_FILE_NAMES = {
-    DatasetType.jsonl: "openfoodfacts-products.jsonl.gz",
-    DatasetType.csv: "en.openfoodfacts.org.products.csv.gz",
+    Flavor.off: {
+        DatasetType.jsonl: "openfoodfacts-products.jsonl.gz",
+        DatasetType.csv: "en.openfoodfacts.org.products.csv.gz",
+    },
+    Flavor.obf: {
+        DatasetType.jsonl: "openbeautyfacts-products.jsonl.gz",
+        DatasetType.csv: "en.openbeautyfacts.org.products.csv",
+    },
+    Flavor.opff: {
+        DatasetType.jsonl: "openpetfoodfacts-products.jsonl.gz",
+        DatasetType.csv: "en.openpetfoodfacts.org.products.csv",
+    },
+    Flavor.opf: {
+        DatasetType.jsonl: "openproductsfacts-products.jsonl.gz",
+        DatasetType.csv: "en.openproductsfacts.org.products.csv",
+    },
}


def get_dataset(
+    flavor: Flavor = Flavor.off,
    dataset_type: DatasetType = DatasetType.jsonl,
    force_download: bool = False,
    download_newer: bool = False,
@@ -33,7 +48,8 @@ def get_dataset(
    The dataset is downloaded the first time and subsequently cached in
    `~/.cache/openfoodfacts/datasets`.
-    :param dataset_type: The, defaults to DatasetType.jsonl
+    :param flavor: The data source, defaults to Flavor.off
+    :param dataset_type: The returned format, defaults to DatasetType.jsonl
    :param force_download: if True, (re)download the dataset even if it was
        cached, defaults to False
    :param download_newer: if True, download the dataset if a more recent
@@ -43,9 +59,9 @@
    :return: the path of the dataset
    """
    cache_dir = DEFAULT_CACHE_DIR if cache_dir is None else cache_dir
-    file_name = DATASET_FILE_NAMES[dataset_type]
+    file_name = DATASET_FILE_NAMES[flavor][dataset_type]
    dataset_path = cache_dir / file_name
-    url = f"{URLBuilder.static(Flavor.off, Environment.org)}/data/{file_name}"
+    url = f"{URLBuilder.static(flavor, Environment.org)}/data/{file_name}"
    cache_dir.mkdir(parents=True, exist_ok=True)

    if not should_download_file(url, dataset_path, force_download, download_newer):
@@ -57,7 +73,12 @@


class ProductDataset:
-    def __init__(self, dataset_type: DatasetType = DatasetType.jsonl, **kwargs):
+    def __init__(
+        self,
+        flavor: Flavor = Flavor.off,
+        dataset_type: DatasetType = DatasetType.jsonl,
+        **kwargs,
+    ):
        """A product dataset.

        This class is used to iterate over the Open Food Facts dataset.
@@ -66,7 +87,7 @@ def __init__(self, dataset_type: DatasetType = DatasetType.jsonl, **kwargs):
            to DatasetType.jsonl
        """
        self.dataset_type = dataset_type
-        self.dataset_path = get_dataset(dataset_type, **kwargs)
+        self.dataset_path = get_dataset(flavor, dataset_type, **kwargs)

    def __iter__(self):
        if self.dataset_type is DatasetType.jsonl:
@@ -86,5 +107,4 @@ def count(self) -> int:
        count = 0
        for _ in self:
            count += 1
-
        return count
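
For completeness, the lower-level `get_dataset` helper shown above can also be called directly for any flavor; it returns the path of the cached file. A minimal sketch based on the signature in this diff (the import location of the `Flavor` and `DatasetType` enums is an assumption):

```python
from openfoodfacts.dataset import get_dataset
from openfoodfacts.types import DatasetType, Flavor  # assumed import location

# Fetch (or reuse the cached copy of) the Open Pet Food Facts CSV dump,
# re-downloading only if a newer file is available upstream.
path = get_dataset(
    flavor=Flavor.opff,
    dataset_type=DatasetType.csv,
    download_newer=True,
)
print(path)  # e.g. ~/.cache/openfoodfacts/datasets/en.openpetfoodfacts.org.products.csv
```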
