[wip][performance] Use pread when possible to avoid read amplificatio…

…n caused by buffering - [ ] THIS APPROACH IS DOOMED! The reason being that indexed_gzip, indexed_bzip2, indexed_zstd, and rapidgzip implement fileno members returning the file descriptor of the UNDERLYING FUCKING file, i.e., the compressed file. This INVALIDATES MY ASSUMPTION that os.pread(file.fileno) == file.seek() + file.read()!!! Instead, implement a custom file reader class with a pread member, whose existence we can check for and/or inject such a pread member into the file object returned by Python's open builtin. - [ ] Add pread members to indexed_bzip2 and rapidgzip. - [ ] Add an extra commit before this that adds a buffer size argument like Python's open has, and then use pread for the special case of buffer size == 0. And only set this to 0 from the FuseMount class' open, so that ratarmountcoure MountSource.open usage does not loose the buffering! - [ ] Check at what locations pread is exactly necessary!
mxmlnkn · Sep 9, 2024 · 29e3d17 · 29e3d17
1 parent b0d2bc9
commit 29e3d17
Show file tree

Hide file tree

Showing 2 changed files with 55 additions and 3 deletions.
diff --git a/core/ratarmountcore/SQLiteIndexedTar.py b/core/ratarmountcore/SQLiteIndexedTar.py
@@ -1163,7 +1163,22 @@ def read(self, fileInfo: FileInfo, size: int, offset: int) -> bytes:
                 return file.read(size)
 
         # For non-sparse files, we can simply seek to the offset and read from it.
-        self.tarFileObject.seek(tarFileInfo.offset + offset, os.SEEK_SET)
+        if not self.compression:
+            # TODO This still will fail when SQLiteIndexedTar is called with an IndexedGzipFile file object
+            #      as input because fileno will return the file descriptor to the COMPRESSED FILE!
+            #      Instead, add pread methods to file objects to avoid such a FUCKUP. See commit message.
+            fileno = None
+            if hasattr(os, 'pread'):
+                if hasattr(self.tarFileObject, 'fileno'):
+                    try:
+                        fileno = self.tarFileObject.fileno()
+                    except Exception:
+                        pass
+
+            if isinstance(fileno, int) and fileno >= 0:
+                return os.pread(fileno, size, tarFileInfo.offset + offset)
+
+        self.tarFileObject.seek(tarFileInfo.offset + offset, io.SEEK_SET)
         return self.tarFileObject.read(size)
 
     @overrides(MountSource)

diff --git a/core/ratarmountcore/StenciledFile.py b/core/ratarmountcore/StenciledFile.py
@@ -3,6 +3,7 @@
 
 import bisect
 import io
+import os
 
 from typing import Callable, IO, List, Optional, Tuple
 
@@ -151,11 +152,47 @@ def read(self, size: int = -1) -> bytes:
             offsetInsideStencil = self.offset - self.cumsizes[i]
             assert offsetInsideStencil >= 0
             assert offsetInsideStencil < self.sizes[i]
-            self.fileObjects[i].seek(self.offsets[i] + offsetInsideStencil, io.SEEK_SET)
+            offset = self.offsets[i] + offsetInsideStencil
 
             # Read as much as requested or as much as the current contiguous region / stencil still contains
             readableSize = min(size, self.sizes[i] - (self.offset - self.cumsizes[i]))
-            tmp = self.fileObjects[i].read(readableSize)
+
+            fileObject = self.fileObjects[i]
+            fileObject.seek(offset, io.SEEK_SET)
+            tmp = fileObject.read(readableSize)
+
+            #fileno = None
+            #if hasattr(os, 'pread'):
+            #    if hasattr(fileObject, 'fileno'):
+            #        try:
+            #            fileno = fileObject.fileno()
+            #        except Exception:
+            #            pass
+            #
+            #tmp = b''
+            #if isinstance(fileno, int) and fileno >= 0:
+            #    # TODO Read in loop until it is done
+            #    # TODO It seems that the recursive mounting test fails because pread(file.fileno)
+            #    #      returns compressed data while file.read returns uncompressed data!!! !!!
+            #    #      I think IndexedGzipFile is used!
+            #    tmp = os.pread(fileno, readableSize, offset)
+            #
+            ## TODO I do not know why, but some of the tests return nothing for pread...
+            ##      Maybe the files are so small that they get fully buffered and the file descriptor closed?
+            ##      Maybe some intricacy with tempfile?
+            ## TODO Move this pread routine into some reusable function or class...
+            ## TODO Avoid file lock?
+            ##if not tmp:
+            #fileObject.seek(offset, io.SEEK_SET)
+            #tmp2 = fileObject.read(readableSize)
+            #if tmp and len(tmp) != len(tmp2):
+            #    print("tmp:", len(tmp), "tmp2:", len(tmp2))
+            #    print(tmp[:50])
+            #    print(tmp2[:50])
+            #    assert(tmp2.startswith(tmp))
+            ##assert tmp == tmp2
+            #tmp = tmp2
+
             self.offset += len(tmp)
             result += tmp