Skip to content

Commit

Permalink
Add optional file-based listings caching
Browse files Browse the repository at this point in the history
  • Loading branch information
gutzbenj committed May 9, 2024
1 parent da77548 commit dc95a69
Show file tree
Hide file tree
Showing 21 changed files with 577 additions and 222 deletions.
12 changes: 10 additions & 2 deletions docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,9 @@ Base Classes
fsspec.core.OpenFiles
fsspec.core.get_fs_token_paths
fsspec.core.url_to_fs
fsspec.dircache.DirCache
fsspec.dircache.DisabledListingsCache
fsspec.dircache.MemoryListingsCache
fsspec.dircache.FileListingsCache
fsspec.FSMap
fsspec.generic.GenericFileSystem
fsspec.registry.register_implementation
Expand Down Expand Up @@ -82,7 +84,13 @@ Base Classes

.. autofunction:: fsspec.core.url_to_fs

.. autoclass:: fsspec.dircache.DirCache
.. autoclass:: fsspec.dircache.DisabledListingsCache
:members: __init__

.. autoclass:: fsspec.dircache.MemoryListingsCache
:members: __init__

.. autoclass:: fsspec.dircache.FileListingsCache
:members: __init__

.. autoclass:: fsspec.FSMap
Expand Down
8 changes: 8 additions & 0 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
Changelog
=========

Dev
--------

Enhancements

- add file-based listing cache using diskcache (#895)
warning: use new ``listings_cache_options`` instead of ``use_listings_cache`` etc.

2024.3.1
--------

Expand Down
29 changes: 20 additions & 9 deletions docs/source/features.rst
Original file line number Diff line number Diff line change
Expand Up @@ -181,15 +181,26 @@ Listings Caching
----------------

For some implementations, getting file listings (i.e., ``ls`` and anything that
depends on it) is expensive. These implementations use dict-like instances of
:class:`fsspec.dircache.DirCache` to manage the listings.

The cache allows for time-based expiry of entries with the ``listings_expiry_time``
parameter, or LRU expiry with the ``max_paths`` parameter. These can be
set on any implementation instance that uses listings caching; or to skip the
caching altogether, use ``use_listings_cache=False``. That would be appropriate
when the target location is known to be volatile because it is being written
to from other sources.
depends on it) is expensive. These implementations may use either dict-like instances of
:class:`fsspec.dircache.MemoryListingsCache` or file-based caching with instances of
:class:`fsspec.dircache.FileListingsCache` to manage the listings.

The listings cache can be controlled via the keyword ``listings_cache_options`` which is a dictionary.
The type of cache that is used can be controlled via the key ``cache_type`` (``disabled``, ``memory`` or ``file``).
The cache allows for time-based expiry of entries with the keyword ``expiry_time``. If the target location is known to
be volatile, e.g. because it is being written to from other sources, we recommend disabling the listings cache.
If you want to use the file-based caching, you can also provide the argument
``directory`` to determine where the cache file is stored.

Example for ``listings_cache_options``:

.. code-block:: json

    {
        "cache_type": "file",
        "expiry_time": 3600,
        "directory": "/tmp/cache"
    }

When the ``fsspec`` instance writes to the backend, the method ``invalidate_cache``
is called, so that subsequent listing of the given paths will force a refresh. In
Expand Down
12 changes: 6 additions & 6 deletions fsspec/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,19 +37,19 @@ def _all_dirnames(self, paths):
def info(self, path, **kwargs):
self._get_dirs()
path = self._strip_protocol(path)
if path in {"", "/"} and self.dir_cache:
if path in {"", "/"} and self.listings_cache:
return {"name": "", "type": "directory", "size": 0}
if path in self.dir_cache:
return self.dir_cache[path]
elif path + "/" in self.dir_cache:
return self.dir_cache[path + "/"]
if path in self.listings_cache:
return self.listings_cache[path]
elif path + "/" in self.listings_cache:
return self.listings_cache[path + "/"]
else:
raise FileNotFoundError(path)

def ls(self, path, detail=True, **kwargs):
self._get_dirs()
paths = {}
for p, f in self.dir_cache.items():
for p, f in self.listings_cache.items():
p = p.rstrip("/")
if "/" in p:
root = p.rsplit("/", 1)[0]
Expand Down
12 changes: 10 additions & 2 deletions fsspec/asyn.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,15 +312,23 @@ class AsyncFileSystem(AbstractFileSystem):
mirror_sync_methods = True
disable_throttling = False

def __init__(self, *args, asynchronous=False, loop=None, batch_size=None, **kwargs):
def __init__(
self,
*args,
asynchronous=False,
loop=None,
batch_size=None,
listings_cache_options=None,
**kwargs,
):
self.asynchronous = asynchronous
self._pid = os.getpid()
if not asynchronous:
self._loop = loop or get_loop()
else:
self._loop = None
self.batch_size = batch_size
super().__init__(*args, **kwargs)
super().__init__(listings_cache_options, *args, **kwargs)

@property
def loop(self):
Expand Down
98 changes: 0 additions & 98 deletions fsspec/dircache.py

This file was deleted.

6 changes: 3 additions & 3 deletions fsspec/implementations/dbfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def ls(self, path, detail=True, **kwargs):
}
for o in files
]
self.dircache[path] = out
self.listings_cache[path] = out

if detail:
return out
Expand Down Expand Up @@ -380,9 +380,9 @@ def _get_data(self, path, start, end):

def invalidate_cache(self, path=None):
if path is None:
self.dircache.clear()
self.listings_cache.clear()
else:
self.dircache.pop(path, None)
self.listings_cache.pop(path, None)
super().invalidate_cache(path)


Expand Down
10 changes: 5 additions & 5 deletions fsspec/implementations/ftp.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def _get_kwargs_from_urls(urlpath):
def ls(self, path, detail=True, **kwargs):
path = self._strip_protocol(path)
out = []
if path not in self.dircache:
if path not in self.listings_cache:
try:
try:
out = [
Expand All @@ -116,15 +116,15 @@ def ls(self, path, detail=True, **kwargs):
details["size"] = 0
if details["type"] == "dir":
details["type"] = "directory"
self.dircache[path] = out
self.listings_cache[path] = out
except Error:
try:
info = self.info(path)
if info["type"] == "file":
out = [(path, info)]
except (Error, IndexError):
raise FileNotFoundError(path)
files = self.dircache.get(path, out)
files = self.listings_cache.get(path, out)
if not detail:
return sorted([fn for fn, details in files])
return [details for fn, details in files]
Expand Down Expand Up @@ -252,9 +252,9 @@ def __del__(self):

def invalidate_cache(self, path=None):
if path is None:
self.dircache.clear()
self.listings_cache.clear()
else:
self.dircache.pop(path, None)
self.listings_cache.pop(path, None)
super().invalidate_cache(path)


Expand Down
8 changes: 4 additions & 4 deletions fsspec/implementations/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def ls(self, path, detail=False, sha=None, _sha=None, **kwargs):
else:
return path
_sha = out["sha"]
if path not in self.dircache or sha not in [self.root, None]:
if path not in self.listings_cache or sha not in [self.root, None]:
r = requests.get(
self.url.format(org=self.org, repo=self.repo, sha=_sha),
timeout=self.timeout,
Expand All @@ -177,16 +177,16 @@ def ls(self, path, detail=False, sha=None, _sha=None, **kwargs):
if f["type"] in types
]
if sha in [self.root, None]:
self.dircache[path] = out
self.listings_cache[path] = out
else:
out = self.dircache[path]
out = self.listings_cache[path]
if detail:
return out
else:
return sorted([f["name"] for f in out])

def invalidate_cache(self, path=None):
self.dircache.clear()
self.listings_cache.clear()

@classmethod
def _strip_protocol(cls, path):
Expand Down
Loading

0 comments on commit dc95a69

Please sign in to comment.