Skip to content

Commit

Permalink
[wip][feature] Add support for fsspec backends
Browse files Browse the repository at this point in the history
  • Loading branch information
mxmlnkn committed Sep 22, 2024
1 parent f358cb9 commit 4763f34
Show file tree
Hide file tree
Showing 6 changed files with 116 additions and 2 deletions.
12 changes: 12 additions & 0 deletions AppImage/build-ratarmount-appimage.sh
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,18 @@ function installAppImagePythonPackages()

"$APP_PYTHON_BIN" -I -m pip install --no-cache-dir ../core
"$APP_PYTHON_BIN" -I -m pip install --no-cache-dir ..[full]

# ratarmount-0.10.0-manylinux2014_x86_64.AppImage (the first one!) was 13.6 MB
# ratarmount-v0.11.3-manylinux2014_x86_64.AppImage was 13.6 MB
# ratarmount-0.12.0-manylinux2014_x86_64.AppImage was 26.3 MB thanks to an error with the trim-down script.
# ratarmount-0.15.0-x86_64.AppImage was 14.8 MB
# ratarmount-0.15.1-x86_64.AppImage was 13.3 MB (manylinux_2014)
# ratarmount-0.15.2-x86_64.AppImage was 11.7 MB (manylinux_2_28)
# At this point, with pyfatfs, the AppImage is/was 13.0 MB. Extracts to 45.1 MB
# This bloats the AppImage to 23.7 MB, which is still ok, I guess. Extracts to 83.1 MB
"$APP_PYTHON_BIN" -I -m pip install --no-cache-dir requests aiohttp paramiko smbprotocol pygit2 fsspec
# This bloats the AppImage to 38.5 MB :/. Extracts to 121.0 MB
"$APP_PYTHON_BIN" -I -m pip install --no-cache-dir s3fs gcsfs adlfs dropbox dropboxdrivefs
}

function installAppImageSystemLibraries()
Expand Down
30 changes: 30 additions & 0 deletions core/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,39 @@ full = [
# Pin to < 3.12 because of https://github.com/nathanhi/pyfatfs/issues/41
'pyfatfs ~= 1.0; python_version < "3.12.0"',
"paramiko",
# fsspec:
"requests",
"aiohttp",
"paramiko",
"smbprotocol",
"pygit2",
"fsspec",
"s3fs",
"gcsfs",
"adlfs",
"dropbox",
"dropboxdrivefs",
]
bzip2 = ["rapidgzip >= 0.13.1"]
gzip = ["indexed_gzip >= 1.6.3, < 2.0"]
fsspec = [
# Copy-pasted from the fsspec[full] list. Some were excluded because they are disproportionately large.
"requests",
"aiohttp",
"paramiko",
"smbprotocol", # build error in Python 3.13
"pygit2", # build error in Python 3.13
"fsspec",
"s3fs",
"gcsfs",
"adlfs", # build error in Python 3.13
"dropbox",
"dropboxdrivefs",
# "dask", "distributed" : ~34 MB, ~10 MB gzip-compressed
# "pyarrow >= 1" : ~196 MB, ~60 MB gzip-compressed, build error in Python 3.13
# "ocifs" : ~350 MB
# "panel" : only for fsspec GUI
]
# Need >= 4.1 because of https://github.com/markokr/rarfile/issues/73
rar = ["rarfile ~= 4.1"]
# For now, only optional (and installed in the AppImage) because it is unstable and depends on many other packages
Expand Down
3 changes: 3 additions & 0 deletions core/ratarmountcore/SQLiteIndex.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,9 @@ def reloadIndexReadOnly(self):
self.sqlConnection = SQLiteIndex._openSqlDb(f"file:{uriPath}?mode=ro", uri=True, check_same_thread=False)

def _reloadIndexOnDisk(self):
if self.printDebug >= 2:
print("[Info] Try to reopen SQLite database on disk at:", self.indexFilePath)
print("other index paths:", self.possibleIndexFilePaths)
if not self.indexFilePath or self.indexFilePath != ':memory:' or not self.sqlConnection:
return

Expand Down
41 changes: 41 additions & 0 deletions core/ratarmountcore/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@
from .ZipMountSource import ZipMountSource
from .LibarchiveMountSource import LibarchiveMountSource

try:
import fsspec
except ImportError:
fsspec = None # type: ignore

try:
import paramiko
except ImportError:
Expand Down Expand Up @@ -148,6 +153,42 @@ def openMountSource(fileOrPath: Union[str, IO[bytes]], **options) -> MountSource
else:
return mountSource

if fsspec and protocol and protocol != 'file':
name = 'fsspec'
try:
if printDebug >= 3:
print(f"[Info] Try to open with {name}")

openFile = fsspec.open(path)
assert isinstance(openFile, fsspec.core.OpenFile)

# Note that http:// URLs are always files. Folders are only regex-parsed HTML files!
# By checking with isdir instead of isfile, we give isdir a higher precedence.
# TODO The filesystems are not uniform! http:// expects the argument to isdir to be prefixed with the
# protocol, while other filesystem implementations are fine with only the path.
# https://github.com/ray-project/ray/issues/26423#issuecomment-1179561181
# Disable pylint errors. See https://github.com/fsspec/filesystem_spec/issues/1678
if openFile.fs.isdir(openFile.path): # pylint: disable=no-member
# TODO check if it is a filesystem or a file and create a MountSource if necessary.
raise Exception("Expected file.")

# This open call can fail with FileNotFoundError, IsADirectoryError, and probably others.
result = openFile.open() # pylint: disable=no-member
closeFileOnError = True

# Check that seeking works. May fail when, e.g., the HTTP server does not support range requests.
# Use https://github.com/danvk/RangeHTTPServer for testing purposes because
# "python3 -m http.server 9000" does not have range support. Use "python3 -m RangeHTTPServer 9000".
result.seek(1)
result.read(1)
result.seek(0)

fileOrPath = result
except Exception as exception:
if printDebug >= 2:
print(f"[Info] Trying to open with {name} raised an exception:", exception)
if printDebug >= 3:
traceback.print_exc()

joinedFileName = ''
if isinstance(fileOrPath, str):
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ zip = ["ratarmountcore[zip]"]
zstd = ["ratarmountcore[zstd]"]
ssh = ["ratarmountcore[ssh]"]
squashfs = ["ratarmountcore[squashfs]"]
fsspec = ["ratarmountcore[fsspec]"]

[project.scripts]
ratarmount = "ratarmount:cli"
Expand Down
31 changes: 29 additions & 2 deletions ratarmount.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,17 @@
import zipfile
from typing import Any, Callable, Dict, Iterable, IO, List, Optional, Tuple, Union

# TODO Implement write-support for SSH and other fsspec overlays? May not make much sense for S3, HTTP
# May make sense for ftp, ssh, sftp, dropbox(?), google cloud(?), azure cloud(?)
# TODO Fix index saving when opening an fsspec URL. It should be saved to ~/.cache/ratarmount in that case.
# TODO Use Python fileobject.pread if available from indexed_bzip2 / rapidgzip backends
# TODO Do we still have Samba at TUD for testing?
# TODO Rapidgzip over HTTP is really SLOW with 24 threads (6 MB/s vs 200 MB/s without parallelization).
# I think that the random-like strided access with 128 KiB sizes is bad. Furthermore, it may also
# be better to increase the read block size to 1-16 MiB for HTTP and other cloud backends.
# Can I somehow communicate that to rapidgzip? Or can it be detected dynamically?
# Why not increase the read buffer size to the chunk size? The decompressed data makes up
# most of the memory usage anyway for reasonable compression ratios.

try:
from ratarmountcore.fusepy import fuse
Expand Down Expand Up @@ -56,6 +67,12 @@
except ImportError:
paramiko = None # type: ignore

try:
import fsspec
except ImportError:
fsspec = None # type: ignore


import ratarmountcore as core
from ratarmountcore import (
AutoMountLayer,
Expand Down Expand Up @@ -916,8 +933,18 @@ def checkInputFileType(
splitURI = tarFile.split('://')
if len(splitURI) > 1:
protocol = splitURI[0]
if protocol in ['ssh', 'sftp'] and paramiko is None:
raise argparse.ArgumentTypeError("Detected an URI, but paramiko was not found. Try: pip install paramiko")
if protocol in SFTPMountSource.PROTOCOLS:
if paramiko is None:
raise argparse.ArgumentTypeError("Detected an URI, but paramiko was not found. Try: pip install paramiko.")
else
if fsspec is None:
raise argparse.ArgumentTypeError("Detected an URI, but fsspec was not found. Try: pip install fsspec.")
if protocol not in fsspec.available_protocols():
raise argparse.ArgumentTypeError(
f"URI: {tarFile} uses an unknown protocol. Protocols known by fsspec are: "
+ ', '.join(fsspec.available_protocols())
)

return tarFile, None

if not os.path.isfile(tarFile):
Expand Down

0 comments on commit 4763f34

Please sign in to comment.