Fix DataFrameDiskCache locking #130

Merged
merged 1 commit into from
Oct 7, 2024
Merged
2 changes: 2 additions & 0 deletions nix/packages.nix
@@ -198,6 +198,7 @@
++ (with pp; [
beautifulsoup4
colorlog
filelock
graphviz
numpy
packageurl-python
@@ -226,6 +227,7 @@
(with ps; [
beautifulsoup4
colorlog
filelock
graphviz
numpy
packageurl-python
1 change: 1 addition & 0 deletions setup.py
@@ -25,6 +25,7 @@ def project_path(*names):
requires = [
"beautifulsoup4",
"colorlog",
"filelock",
"graphviz",
"numpy",
"pandas",
5 changes: 0 additions & 5 deletions src/common/utils.py
@@ -14,8 +14,6 @@
import logging
import subprocess
import importlib.metadata
import pathlib
import tempfile
import urllib.error
from shutil import which

@@ -33,9 +31,6 @@
LOG_SPAM = logging.DEBUG - 1
LOG = logging.getLogger(os.path.abspath(__file__))

# DataFrameDiskCache cache local path
DFCACHE_PATH = pathlib.Path(tempfile.gettempdir()) / "sbomnix_df_cache"

###############################################################################


20 changes: 3 additions & 17 deletions src/sbomnix/cpe.py
@@ -8,13 +8,10 @@

import sys
import string
import time
from sqlite3 import OperationalError
from dfdiskcache import DataFrameDiskCache
from sbomnix.dfcache import LockedDfCache
from common.utils import (
LOG,
LOG_SPAM,
DFCACHE_PATH,
df_from_csv_file,
df_log,
)
@@ -33,7 +30,7 @@ class CPE:
"""Generate Common Platform Enumeration identifiers"""

def __init__(self):
self.cache = DataFrameDiskCache(cache_dir_path=DFCACHE_PATH)
self.cache = LockedDfCache()
self.df_cpedict = self.cache.get(_CPE_CSV_URL)
if self.df_cpedict is not None and not self.df_cpedict.empty:
LOG.debug("read CPE dictionary from cache")
@@ -45,18 +42,7 @@ def __init__(self):
"Failed downloading cpedict: CPE information might not be accurate"
)
else:
waiting = True
while waiting:
try:
self.cache.set(
_CPE_CSV_URL, self.df_cpedict, ttl=_CPE_CSV_CACHE_TTL
)
waiting = False
except OperationalError:
LOG.warning(
"CPE Sqlite database is locked! Retrying in 1 second"
)
time.sleep(1)
self.cache.set(_CPE_CSV_URL, self.df_cpedict, ttl=_CPE_CSV_CACHE_TTL)

if self.df_cpedict is not None:
# Verify the loaded cpedict contains at least the following columns
50 changes: 50 additions & 0 deletions src/sbomnix/dfcache.py
@@ -0,0 +1,50 @@
# SPDX-FileCopyrightText: 2022-2024 Technology Innovation Institute (TII)
#
# SPDX-License-Identifier: Apache-2.0

# pylint: disable=too-few-public-methods

"""Thread-safe DataFrameDiskCache"""

import pathlib
import tempfile
from getpass import getuser

from filelock import FileLock
from dfdiskcache import DataFrameDiskCache

###############################################################################

# DataFrameDiskCache cache local path and lock file
DFCACHE_PATH = pathlib.Path(tempfile.gettempdir()) / f"{getuser()}_sbomnix_df_cache"
DFCACHE_LOCK = DFCACHE_PATH / "dfcache.lock"

################################################################################


class LockedDfCache:
"""Thread-safe (and process-safe) wrapper for DataFrameDiskCache"""

def __init__(self):
self.dflock = FileLock(DFCACHE_LOCK)

def __getattr__(self, name):

def wrap(*a, **k):
with self.dflock:
# We intentionally do not store the dfcache as object variable
# but re-instantiate it every time any LockedDfCache method
# is called. DataFrameDiskCache internally makes use of sqlite
# which does not allow concurrent connections to the database.
# Having the dfcache initiated once in __init__() and then
# re-used here would mean the connection would remain reserved
# for the first thread making other threads throw with
# 'database locked' etc. even if we otherwise protect
# concurrent writes.
dfcache = DataFrameDiskCache(cache_dir_path=DFCACHE_PATH)
return getattr(dfcache, name)(*a, **k)

return wrap


###############################################################################
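A minimal usage sketch of the new wrapper, assuming a hypothetical cache key and a small pandas DataFrame (neither appears in the diff). Each proxied call acquires the file lock, opens a fresh DataFrameDiskCache, performs the operation, and releases the lock before returning, so concurrent sbomnix processes never hold the sqlite connection between calls:

import pandas as pd
from sbomnix.dfcache import LockedDfCache

cache = LockedDfCache()
df = pd.DataFrame({"name": ["hello"], "version": ["2.12"]})
# set() is proxied through __getattr__; the file lock is held only for this call
cache.set("example-key", df, ttl=60)
# get() takes the lock again, so another process may read or write in between
cached = cache.get("example-key")
if cached is not None and not cached.empty:
    print(cached)
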
65 changes: 37 additions & 28 deletions src/sbomnix/meta.py
@@ -6,16 +6,18 @@

"""Cache nixpkgs meta information"""

import time
import os
import re
import logging
import pathlib
import tempfile
from getpass import getuser

from sqlite3 import OperationalError
import pandas as pd
from dfdiskcache import DataFrameDiskCache
from filelock import FileLock
from sbomnix.dfcache import LockedDfCache
from nixmeta.scanner import NixMetaScanner, nixref_to_nixpkgs_path
from common.utils import LOG, df_from_csv_file, df_to_csv_file, DFCACHE_PATH
from common.utils import LOG, df_from_csv_file, df_to_csv_file

###############################################################################

@@ -28,22 +30,18 @@
# is cleaned.
_NIXMETA_NIXPKGS_TTL = 60 * 60 * 24 * 30

# FileLock lock path
_FLOCK = pathlib.Path(tempfile.gettempdir()) / f"{getuser()}_sbomnix_meta.lock"

###############################################################################


class Meta:
"""Cache nixpkgs meta information"""

def __init__(self):
waiting = True
while waiting:
try:
self.cache = DataFrameDiskCache(cache_dir_path=DFCACHE_PATH)
waiting = False
except OperationalError:
LOG.warning("DFCACHE Sqlite database is locked! Retrying in 1 second")
time.sleep(1)

self.lock = FileLock(_FLOCK)
self.cache = LockedDfCache()
# df_nixmeta includes the meta-info from _NIXMETA_CSV_URL
self.df_nixmeta = self.cache.get(_NIXMETA_CSV_URL)
if self.df_nixmeta is not None and not self.df_nixmeta.empty:
@@ -103,22 +101,33 @@ def get_nixpkgs_meta(self, nixref=None):
return df_concat

def _scan(self, nixpkgs_path):
df = self.cache.get(nixpkgs_path)
if df is not None and not df.empty:
LOG.debug("found from cache: %s", nixpkgs_path)
# In case sbomnix is run concurrently, we want to make sure there's
# only one instance of NixMetaScanner.scan() running at a time.
# The reason is, NixMetaScanner.scan() potentially invokes
# `nix-env -qa --meta --json -f /path/to/nixpkgs` which is very
# memory intensive. The locking needs to happen here (and not in
# NixMetaScanner) because this object caches the nixmeta info.
# First scan generates the cache, after which the consecutive scans
# will read the scan results from the cache, not having to run
# the nix-env command again, making the consecutive scans relatively
# fast and light-weight.
with self.lock:
df = self.cache.get(nixpkgs_path)
if df is not None and not df.empty:
LOG.debug("found from cache: %s", nixpkgs_path)
return df
LOG.debug("cache miss, scanning: %s", nixpkgs_path)
scanner = NixMetaScanner()
scanner.scan(nixpkgs_path)
df = scanner.to_df()
if df is None or df.empty:
LOG.warning("Failed scanning nixmeta: %s", nixpkgs_path)
return None
# Cache requires some TTL, so we set it to some value here.
# Although, we could as well store it indefinitely as it should
# not change given the same key (nixpkgs store path).
self.cache.set(key=nixpkgs_path, value=df, ttl=_NIXMETA_NIXPKGS_TTL)
return df
LOG.debug("cache miss, scanning: %s", nixpkgs_path)
scanner = NixMetaScanner()
scanner.scan(nixpkgs_path)
df = scanner.to_df()
if df is None or df.empty:
LOG.warning("Failed scanning nixmeta: %s", nixpkgs_path)
return None
# Cache requires some TTL, so we set it to some value here.
# Although, we could as well store it indefinitely as it should
# not change given the same key (nixpkgs store path).
self.cache.set(key=nixpkgs_path, value=df, ttl=_NIXMETA_NIXPKGS_TTL)
return df


###############################################################################
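The relocated locking in Meta._scan() above follows a check-then-compute-under-lock pattern: the cache lookup and the expensive NixMetaScanner.scan() both run inside one FileLock critical section, so only the first concurrent caller pays for `nix-env -qa --meta --json` while later callers find the populated cache once they acquire the lock. A stand-alone sketch of that pattern, with an illustrative lock path and helper names that are not part of the PR:

from filelock import FileLock

_lock = FileLock("/tmp/example_scan.lock")  # hypothetical lock file

def cached_scan(cache, key, expensive_scan, ttl):
    """Run expensive_scan() at most once across concurrent callers for key."""
    with _lock:
        df = cache.get(key)
        if df is not None and not df.empty:
            # Another process populated the cache while we waited for the lock.
            return df
        df = expensive_scan()
        if df is None or df.empty:
            return None
        cache.set(key=key, value=df, ttl=ttl)
        return df
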