Skip to content

Commit

Permalink
[wip][performance] Use pread when possible to avoid read amplificatio…
Browse files Browse the repository at this point in the history
…n caused by buffering

 - [ ] THIS APPROACH IS DOOMED! The reason being that indexed_gzip,
   indexed_bzip2, indexed_zstd, and rapidgzip implement fileno members
   returning the file descriptor of the UNDERLYING FUCKING file, i.e.,
   the compressed file. This INVALIDATES MY ASSUMPTION that
   os.pread(file.fileno) == file.seek() + file.read()!!!
   Instead, implement a custom file reader class with a pread member,
   whose existence we can check for and/or inject such a pread member
   into the file object returned by Python's open builtin.
 - [ ] Add pread members to indexed_bzip2 and rapidgzip.
 - [ ] Add an extra commit before this that adds a buffer size argument
   like Python's open has, and then use pread for the special case of
   buffer size == 0. And only set this to 0 from the FuseMount class'
   open, so that ratarmountcoure MountSource.open usage does not loose
   the buffering!
 - [ ] Check at what locations pread is exactly necessary!
  • Loading branch information
mxmlnkn committed Sep 9, 2024
1 parent b0d2bc9 commit 29e3d17
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 3 deletions.
17 changes: 16 additions & 1 deletion core/ratarmountcore/SQLiteIndexedTar.py
Original file line number Diff line number Diff line change
Expand Up @@ -1163,7 +1163,22 @@ def read(self, fileInfo: FileInfo, size: int, offset: int) -> bytes:
return file.read(size)

# For non-sparse files, we can simply seek to the offset and read from it.
self.tarFileObject.seek(tarFileInfo.offset + offset, os.SEEK_SET)
if not self.compression:
# TODO This still will fail when SQLiteIndexedTar is called with an IndexedGzipFile file object
# as input because fileno will return the file descriptor to the COMPRESSED FILE!
# Instead, add pread methods to file objects to avoid such a FUCKUP. See commit message.
fileno = None
if hasattr(os, 'pread'):
if hasattr(self.tarFileObject, 'fileno'):
try:
fileno = self.tarFileObject.fileno()
except Exception:
pass

if isinstance(fileno, int) and fileno >= 0:
return os.pread(fileno, size, tarFileInfo.offset + offset)

self.tarFileObject.seek(tarFileInfo.offset + offset, io.SEEK_SET)
return self.tarFileObject.read(size)

@overrides(MountSource)
Expand Down
41 changes: 39 additions & 2 deletions core/ratarmountcore/StenciledFile.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import bisect
import io
import os

from typing import Callable, IO, List, Optional, Tuple

Expand Down Expand Up @@ -151,11 +152,47 @@ def read(self, size: int = -1) -> bytes:
offsetInsideStencil = self.offset - self.cumsizes[i]
assert offsetInsideStencil >= 0
assert offsetInsideStencil < self.sizes[i]
self.fileObjects[i].seek(self.offsets[i] + offsetInsideStencil, io.SEEK_SET)
offset = self.offsets[i] + offsetInsideStencil

# Read as much as requested or as much as the current contiguous region / stencil still contains
readableSize = min(size, self.sizes[i] - (self.offset - self.cumsizes[i]))
tmp = self.fileObjects[i].read(readableSize)

fileObject = self.fileObjects[i]
fileObject.seek(offset, io.SEEK_SET)
tmp = fileObject.read(readableSize)

#fileno = None
#if hasattr(os, 'pread'):
# if hasattr(fileObject, 'fileno'):
# try:
# fileno = fileObject.fileno()
# except Exception:
# pass
#
#tmp = b''
#if isinstance(fileno, int) and fileno >= 0:
# # TODO Read in loop until it is done
# # TODO It seems that the recursive mounting test fails because pread(file.fileno)
# # returns compressed data while file.read returns uncompressed data!!! !!!
# # I think IndexedGzipFile is used!
# tmp = os.pread(fileno, readableSize, offset)
#
## TODO I do not know why, but some of the tests return nothing for pread...
## Maybe the files are so small that they get fully buffered and the file descriptor closed?
## Maybe some intricacy with tempfile?
## TODO Move this pread routine into some reusable function or class...
## TODO Avoid file lock?
##if not tmp:
#fileObject.seek(offset, io.SEEK_SET)
#tmp2 = fileObject.read(readableSize)
#if tmp and len(tmp) != len(tmp2):
# print("tmp:", len(tmp), "tmp2:", len(tmp2))
# print(tmp[:50])
# print(tmp2[:50])
# assert(tmp2.startswith(tmp))
##assert tmp == tmp2
#tmp = tmp2

self.offset += len(tmp)
result += tmp

Expand Down

0 comments on commit 29e3d17

Please sign in to comment.