diff --git a/cache/cache.cc b/cache/cache.cc
index a65f5ec4f8..13ddf4b772 100644
--- a/cache/cache.cc
+++ b/cache/cache.cc
@@ -155,4 +155,54 @@ void Cache::SetEvictionCallback(EvictionCallback&& fn) {
   eviction_callback_ = std::move(fn);
 }
 
+// ==================================================================================================================================
+Cache::ItemOwnerId Cache::ItemOwnerIdAllocator::Allocate() {
+  // In practice, owner-ids are allocated and freed only when cf-s
+  // are created and destroyed => relatively rare => we pay the price
+  // of always locking the mutex to keep the code simple.
+  std::lock_guard lock(free_ids_mutex_);
+
+  // First allocate from the free list if possible
+  if (!free_ids_.empty()) {
+    auto allocated_id = free_ids_.front();
+    free_ids_.pop_front();
+    return allocated_id;
+  }
+
+  // Nothing on the free list - try to allocate from the
+  // next-item counter if it is not yet exhausted
+  if (has_wrapped_around_) {
+    // Counter exhausted, allocation not possible
+    return kUnknownItemId;
+  }
+
+  auto allocated_id = next_item_owner_id_++;
+
+  if (allocated_id == kMaxOwnerItemId) {
+    has_wrapped_around_ = true;
+  }
+
+  return allocated_id;
+}
+
+void Cache::ItemOwnerIdAllocator::Free(ItemOwnerId* id) {
+  if (*id != kUnknownItemId) {
+    std::lock_guard lock(free_ids_mutex_);
+    // If the free list is already at its maximum size, the freed id is
+    // simply lost. Recycling ids is best-effort; we don't pay unbounded
+    // space to support it.
+    if (free_ids_.size() < kMaxFreeItemOwnersIdListSize) {
+      free_ids_.push_back(*id);
+    }
+    *id = kUnknownItemId;
+  }
+}
+
+Cache::ItemOwnerId Cache::GetNextItemOwnerId() {
+  return owner_id_allocator_.Allocate();
+}
+
+void Cache::DiscardItemOwnerId(ItemOwnerId* item_owner_id) {
+  owner_id_allocator_.Free(item_owner_id);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/cache/cache_entry_roles.cc b/cache/cache_entry_roles.cc
index f83ada2313..e8adab412e 100644
--- a/cache/cache_entry_roles.cc
+++ b/cache/cache_entry_roles.cc
@@ -101,4 +101,19 @@ std::string BlockCacheEntryStatsMapKeys::UsedPercent(CacheEntryRole role) {
   return GetPrefixedCacheEntryRoleName(kPrefix, role);
 }
 
+const std::string& BlockCacheCfStatsMapKeys::CfName() {
+  static const std::string kCfName = "cf_name";
+  return kCfName;
+}
+
+const std::string& BlockCacheCfStatsMapKeys::CacheId() {
+  static const std::string kCacheId = "id";
+  return kCacheId;
+}
+
+std::string BlockCacheCfStatsMapKeys::UsedBytes(CacheEntryRole role) {
+  static const std::string kPrefix = "bytes.";
+  return GetPrefixedCacheEntryRoleName(kPrefix, role);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/cache/cache_entry_stats.h b/cache/cache_entry_stats.h
index 9968995da9..c67ea6b75d 100644
--- a/cache/cache_entry_stats.h
+++ b/cache/cache_entry_stats.h
@@ -83,7 +83,8 @@ class CacheEntryStatsCollector {
         last_start_time_micros_ = start_time_micros;
         working_stats_.BeginCollection(cache_, clock_, start_time_micros);
 
-        cache_->ApplyToAllEntries(working_stats_.GetEntryCallback(), {});
+        cache_->ApplyToAllEntriesWithOwnerId(working_stats_.GetEntryCallback(),
+                                             {});
         TEST_SYNC_POINT_CALLBACK(
             "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries",
             nullptr);
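Note on the allocation scheme above: ids are handed out sequentially until the counter reaches kMaxOwnerItemId, after which only ids recycled through the bounded free list can be served. A minimal standalone sketch of the same scheme (an illustrative stand-in, not the RocksDB class itself):

#include <cstdint>
#include <iostream>
#include <limits>
#include <list>
#include <mutex>

// Hypothetical stand-in for Cache::ItemOwnerIdAllocator, for illustration only.
class IdAllocator {
 public:
  using Id = uint16_t;
  static constexpr Id kUnknown = 0;
  static constexpr Id kMin = 1;
  static constexpr Id kMax = std::numeric_limits<Id>::max();
  static constexpr size_t kMaxFreeListSize = 10000;

  Id Allocate() {
    std::lock_guard<std::mutex> lock(mutex_);
    if (!free_ids_.empty()) {  // Reuse a previously freed id first
      Id id = free_ids_.front();
      free_ids_.pop_front();
      return id;
    }
    if (exhausted_) {  // Counter wrapped: only recycled ids remain
      return kUnknown;
    }
    Id id = next_++;
    if (id == kMax) exhausted_ = true;
    return id;
  }

  void Free(Id* id) {
    if (*id == kUnknown) return;
    std::lock_guard<std::mutex> lock(mutex_);
    // Bounded free list: beyond the cap, freed ids are simply lost.
    if (free_ids_.size() < kMaxFreeListSize) free_ids_.push_back(*id);
    *id = kUnknown;
  }

 private:
  std::mutex mutex_;
  std::list<Id> free_ids_;
  Id next_ = kMin;
  bool exhausted_ = false;
};

int main() {
  IdAllocator alloc;
  IdAllocator::Id a = alloc.Allocate();   // 1
  IdAllocator::Id b = alloc.Allocate();   // 2
  alloc.Free(&a);                         // a becomes kUnknown; 1 goes on the free list
  std::cout << alloc.Allocate() << "\n";  // prints 1 (recycled)
  std::cout << b << "\n";                 // prints 2
  return 0;
}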
diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc
index 12be0babef..fcb87a24ac 100644
--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
@@ -1056,10 +1056,11 @@ void ClockCacheShard<Table>::EraseUnRefEntries() {
 }
 
 template <class Table>
-void ClockCacheShard<Table>::ApplyToSomeEntries(
+void ClockCacheShard<Table>::ApplyToSomeEntriesWithOwnerId(
     const std::function<void(const Slice& key, Cache::ObjectPtr value,
                              size_t charge,
-                             const Cache::CacheItemHelper* helper)>& callback,
+                             const Cache::CacheItemHelper* helper,
+                             Cache::ItemOwnerId item_owner_id)>& callback,
     size_t average_entries_per_lock, size_t* state) {
   // The state is essentially going to be the starting hash, which works
   // nicely even if we resize between calls because we use upper-most
@@ -1086,7 +1087,7 @@ void ClockCacheShard<Table>::ApplyToSomeEntries(
       [callback](const HandleImpl& h) {
         UniqueId64x2 unhashed;
         callback(ReverseHash(h.hashed_key, &unhashed), h.value,
-                 h.GetTotalCharge(), h.helper);
+                 h.GetTotalCharge(), h.helper, Cache::kUnknownItemId);
       },
       index_begin, index_end, false);
 }
@@ -1134,6 +1135,16 @@ Status ClockCacheShard<Table>::Insert(const Slice& key,
                                       const Cache::CacheItemHelper* helper,
                                       size_t charge, HandleImpl** handle,
                                       Cache::Priority priority) {
+  return InsertWithOwnerId(key, hashed_key, value, helper, charge,
+                           Cache::kUnknownItemId, handle, priority);
+}
+
+template <class Table>
+Status ClockCacheShard<Table>::InsertWithOwnerId(
+    const Slice& key, const UniqueId64x2& hashed_key, Cache::ObjectPtr value,
+    const Cache::CacheItemHelper* helper, size_t charge,
+    Cache::ItemOwnerId /* item_owner_id */, HandleImpl** handle,
+    Cache::Priority priority) {
   if (UNLIKELY(key.size() != kCacheKeySize)) {
     return Status::NotSupported("ClockCache only supports key size " +
                                 std::to_string(kCacheKeySize) + "B");
diff --git a/cache/clock_cache.h b/cache/clock_cache.h
index fc5aef6cb4..48c82296d1 100644
--- a/cache/clock_cache.h
+++ b/cache/clock_cache.h
@@ -614,6 +614,12 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
                 Cache::ObjectPtr value, const Cache::CacheItemHelper* helper,
                 size_t charge, HandleImpl** handle, Cache::Priority priority);
 
+  Status InsertWithOwnerId(const Slice& key, const UniqueId64x2& hashed_key,
+                           Cache::ObjectPtr value,
+                           const Cache::CacheItemHelper* helper, size_t charge,
+                           Cache::ItemOwnerId /* item_owner_id */,
+                           HandleImpl** handle, Cache::Priority priority);
+
   HandleImpl* CreateStandalone(const Slice& key, const UniqueId64x2& hashed_key,
                                Cache::ObjectPtr obj,
                                const Cache::CacheItemHelper* helper,
@@ -643,10 +649,11 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
 
   size_t GetTableAddressCount() const;
 
-  void ApplyToSomeEntries(
-      const std::function<void(const Slice& key, Cache::ObjectPtr value,
-                               size_t charge,
-                               const Cache::CacheItemHelper* helper)>& callback,
+  void ApplyToSomeEntriesWithOwnerId(
+      const std::function<void(const Slice& key, Cache::ObjectPtr value,
+                               size_t charge,
+                               const Cache::CacheItemHelper* helper,
+                               Cache::ItemOwnerId item_owner_id)>& callback,
       size_t average_entries_per_lock, size_t* state);
 
   void EraseUnRefEntries();
diff --git a/cache/lru_cache.cc b/cache/lru_cache.cc
index 3b4e80ef87..6a1eb46eff 100644
--- a/cache/lru_cache.cc
+++ b/cache/lru_cache.cc
@@ -165,10 +165,11 @@ void LRUCacheShard::EraseUnRefEntries() {
   }
 }
 
-void LRUCacheShard::ApplyToSomeEntries(
+void LRUCacheShard::ApplyToSomeEntriesWithOwnerId(
     const std::function<void(const Slice& key, Cache::ObjectPtr value,
                              size_t charge,
-                             const Cache::CacheItemHelper* helper)>& callback,
+                             const Cache::CacheItemHelper* helper,
+                             Cache::ItemOwnerId item_owner_id)>& callback,
     size_t average_entries_per_lock, size_t* state) {
   // The state is essentially going to be the starting hash, which works
   // nicely even if we resize between calls because we use upper-most
@@ -196,7 +197,7 @@ void LRUCacheShard::ApplyToSomeEntries(
       [callback,
        metadata_charge_policy = metadata_charge_policy_](LRUHandle* h) {
         callback(h->key(), h->value, h->GetCharge(metadata_charge_policy),
-                 h->helper);
+                 h->helper, h->item_owner_id);
       },
       index_begin, index_end);
 }
@@ -518,7 +519,8 @@ bool LRUCacheShard::Release(LRUHandle* e, bool /*useful*/,
 
 LRUHandle* LRUCacheShard::CreateHandle(const Slice& key, uint32_t hash,
                                        Cache::ObjectPtr value,
                                        const Cache::CacheItemHelper* helper,
-                                       size_t charge) {
+                                       size_t charge,
+                                       Cache::ItemOwnerId item_owner_id) {
   assert(helper);
   // value == nullptr is reserved for indicating failure in SecondaryCache
   assert(!(helper->IsSecondaryCacheCompatible() && value == nullptr));
@@ -539,7 +541,7 @@ LRUHandle* LRUCacheShard::CreateHandle(const Slice& key, uint32_t hash,
   e->next = e->prev = nullptr;
   memcpy(e->key_data, key.data(), key.size());
   e->CalcTotalCharge(charge, metadata_charge_policy_);
-
+  e->item_owner_id = item_owner_id;
   return e;
 }
 
@@ -548,7 +550,18 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash,
                              const Cache::CacheItemHelper* helper,
                              size_t charge, LRUHandle** handle,
                              Cache::Priority priority) {
-  LRUHandle* e = CreateHandle(key, hash, value, helper, charge);
+  return InsertWithOwnerId(key, hash, value, helper, charge,
+                           Cache::kUnknownItemId, handle, priority);
+}
+
+Status LRUCacheShard::InsertWithOwnerId(const Slice& key, uint32_t hash,
+                                        Cache::ObjectPtr value,
+                                        const Cache::CacheItemHelper* helper,
+                                        size_t charge,
+                                        Cache::ItemOwnerId item_owner_id,
+                                        LRUHandle** handle,
+                                        Cache::Priority priority) {
+  LRUHandle* e = CreateHandle(key, hash, value, helper, charge, item_owner_id);
   e->SetPriority(priority);
   e->SetInCache(true);
   return InsertItem(e, handle);
@@ -559,7 +572,8 @@ LRUHandle* LRUCacheShard::CreateStandalone(const Slice& key, uint32_t hash,
                                            const Cache::CacheItemHelper* helper,
                                            size_t charge,
                                            bool allow_uncharged) {
-  LRUHandle* e = CreateHandle(key, hash, value, helper, charge);
+  LRUHandle* e =
+      CreateHandle(key, hash, value, helper, charge, Cache::kUnknownItemId);
   e->SetIsStandalone(true);
   e->Ref();
diff --git a/cache/lru_cache.h b/cache/lru_cache.h
index 554907b3be..725aad18d3 100644
--- a/cache/lru_cache.h
+++ b/cache/lru_cache.h
@@ -59,6 +59,7 @@ struct LRUHandle {
   uint32_t hash;
   // The number of external refs to this entry. The cache itself is not counted.
   uint32_t refs;
+  Cache::ItemOwnerId item_owner_id = Cache::kUnknownItemId;
 
   // Mutable flags - access controlled by mutex
   // The m_ and M_ prefixes (and im_ and IM_ later) are to hopefully avoid
@@ -302,6 +303,12 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
                 const Cache::CacheItemHelper* helper, size_t charge,
                 LRUHandle** handle, Cache::Priority priority);
 
+  Status InsertWithOwnerId(const Slice& key, uint32_t hash,
+                           Cache::ObjectPtr value,
+                           const Cache::CacheItemHelper* helper, size_t charge,
+                           Cache::ItemOwnerId /* item_owner_id */,
+                           LRUHandle** handle, Cache::Priority priority);
+
   LRUHandle* CreateStandalone(const Slice& key, uint32_t hash,
                               Cache::ObjectPtr obj,
                               const Cache::CacheItemHelper* helper,
@@ -325,10 +332,11 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
   size_t GetOccupancyCount() const;
   size_t GetTableAddressCount() const;
 
-  void ApplyToSomeEntries(
-      const std::function<void(const Slice& key, Cache::ObjectPtr value,
-                               size_t charge,
-                               const Cache::CacheItemHelper* helper)>& callback,
+  void ApplyToSomeEntriesWithOwnerId(
+      const std::function<void(const Slice& key, Cache::ObjectPtr value,
+                               size_t charge,
+                               const Cache::CacheItemHelper* helper,
+                               Cache::ItemOwnerId item_owner_id)>& callback,
       size_t average_entries_per_lock, size_t* state);
 
   void EraseUnRefEntries();
@@ -373,7 +381,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
 
   LRUHandle* CreateHandle(const Slice& key, uint32_t hash,
                           Cache::ObjectPtr value,
-                          const Cache::CacheItemHelper* helper, size_t charge);
+                          const Cache::CacheItemHelper* helper, size_t charge,
+                          Cache::ItemOwnerId item_owner_id);
 
   // Initialized before use.
   size_t capacity_;
diff --git a/cache/sharded_cache.h b/cache/sharded_cache.h
index c8eb58aad5..c8ad2d5ceb 100644
--- a/cache/sharded_cache.h
+++ b/cache/sharded_cache.h
@@ -174,11 +174,19 @@ class ShardedCache : public ShardedCacheBase {
   Status Insert(const Slice& key, ObjectPtr obj, const CacheItemHelper* helper,
                 size_t charge, Handle** handle = nullptr,
                 Priority priority = Priority::LOW) override {
+    return InsertWithOwnerId(key, obj, helper, charge, kUnknownItemId, handle,
+                             priority);
+  }
+
+  Status InsertWithOwnerId(const Slice& key, ObjectPtr obj,
+                           const CacheItemHelper* helper, size_t charge,
+                           ItemOwnerId item_owner_id, Handle** handle = nullptr,
+                           Priority priority = Priority::LOW) override {
     assert(helper);
     HashVal hash = CacheShard::ComputeHash(key);
    auto h_out = reinterpret_cast<HandleImpl**>(handle);
-    return GetShard(hash).Insert(key, hash, obj, helper, charge, h_out,
-                                 priority);
+    return GetShard(hash).InsertWithOwnerId(key, hash, obj, helper, charge,
+                                            item_owner_id, h_out, priority);
   }
 
   Handle* CreateStandalone(const Slice& key, ObjectPtr obj,
@@ -235,6 +243,22 @@ class ShardedCache : public ShardedCacheBase {
       const std::function<void(const Slice& key, ObjectPtr obj, size_t charge,
                                const CacheItemHelper* helper)>& callback,
       const ApplyToAllEntriesOptions& opts) override {
+    auto callback_with_owner_id =
+        [&callback](const Slice& key, ObjectPtr obj, size_t charge,
+                    const CacheItemHelper* helper,
+                    Cache::ItemOwnerId /* item_owner_id */) {
+          callback(key, obj, charge, helper);
+        };
+
+    ApplyToAllEntriesWithOwnerId(callback_with_owner_id, opts);
+  }
+
+  void ApplyToAllEntriesWithOwnerId(
+      const std::function<void(const Slice& key, ObjectPtr obj, size_t charge,
+                               const CacheItemHelper* helper,
+                               ItemOwnerId item_owner_id)>&
+          callback_with_owner_id,
+      const ApplyToAllEntriesOptions& opts) override {
     uint32_t num_shards = GetNumShards();
     // Iterate over part of each shard, rotating between shards, to
     // minimize impact on latency of concurrent operations.
@@ -248,7 +272,8 @@ class ShardedCache : public ShardedCacheBase {
       remaining_work = false;
       for (uint32_t i = 0; i < num_shards; i++) {
         if (states[i] != SIZE_MAX) {
-          shards_[i].ApplyToSomeEntries(callback, aepl, &states[i]);
+          shards_[i].ApplyToSomeEntriesWithOwnerId(callback_with_owner_id, aepl,
+                                                   &states[i]);
           remaining_work |= states[i] != SIZE_MAX;
         }
       }
diff --git a/cache/typed_cache.h b/cache/typed_cache.h
index e42aa4c260..2cd20db573 100644
--- a/cache/typed_cache.h
+++ b/cache/typed_cache.h
@@ -301,13 +301,18 @@ class FullTypedCacheInterface
   inline Status InsertFull(
       const Slice& key, TValuePtr value, size_t charge,
       TypedHandle** handle = nullptr, Priority priority = Priority::LOW,
-      CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier) {
+      CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier,
+      Cache::ItemOwnerId item_owner_id = Cache::kUnknownItemId) {
     auto untyped_handle = reinterpret_cast<Handle**>(handle);
     auto helper = lowest_used_cache_tier == CacheTier::kNonVolatileBlockTier
                       ? GetFullHelper()
                       : GetBasicHelper();
-    return this->cache_->Insert(key, UpCastValue(value), helper, charge,
-                                untyped_handle, priority);
+    return this->cache_->InsertWithOwnerId(key, UpCastValue(value), helper,
+                                           charge, item_owner_id,
+                                           untyped_handle, priority);
   }
 
   // Like SecondaryCache::InsertSaved, with SecondaryCache compatibility
diff --git a/db/column_family.cc b/db/column_family.cc
index 645d2e7d62..d61bdf133e 100644
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -649,6 +649,11 @@ ColumnFamilyData::ColumnFamilyData(
                   CacheReservationManagerImpl>(
                   bbto->block_cache)));
     }
+
+    if (bbto->block_cache && table_cache_) {
+      cache_owner_id_ = bbto->block_cache->GetNextItemOwnerId();
+      table_cache_->SetBlockCacheOwnerId(cache_owner_id_);
+    }
   }
 }
 
@@ -662,6 +667,12 @@ ColumnFamilyData::~ColumnFamilyData() {
   prev->next_ = next;
   next->prev_ = prev;
 
+  const BlockBasedTableOptions* bbto =
+      ioptions_.table_factory->GetOptions<BlockBasedTableOptions>();
+  if (bbto && bbto->block_cache) {
+    bbto->block_cache->DiscardItemOwnerId(&cache_owner_id_);
+  }
+
   if (!dropped_ && column_family_set_ != nullptr) {
     // If it's dropped, it's already removed from column family set
     // If column_family_set_ == nullptr, this is dummy CFD and not in
diff --git a/db/column_family.h b/db/column_family.h
index da26489442..ab16a53d44 100644
--- a/db/column_family.h
+++ b/db/column_family.h
@@ -563,6 +563,8 @@ class ColumnFamilyData {
   static constexpr uint64_t kLaggingFlushesThreshold = 10U;
   void SetNumTimedQueuedForFlush(uint64_t num) { num_queued_for_flush_ = num; }
 
+  Cache::ItemOwnerId GetCacheOwnerId() const { return cache_owner_id_; }
+
   // Allocate and return a new epoch number
   uint64_t NewEpochNumber() { return next_epoch_number_.fetch_add(1); }
 
@@ -688,6 +690,8 @@ class ColumnFamilyData {
   uint64_t num_queued_for_flush_ = 0U;
 
   std::atomic<uint64_t> next_epoch_number_;
+
+  Cache::ItemOwnerId cache_owner_id_ = Cache::kUnknownItemId;
 };
 
 // ColumnFamilySet has interesting thread-safety requirements
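The db/ changes above complete the ownership wiring: a column family obtains an owner id from its block cache on construction, hands it to its TableCache (which stamps it onto every table reader, see db/table_cache.* below), and returns it on destruction. A condensed, compilable sketch of that flow using simplified stand-in types (none of these are the actual RocksDB classes):

#include <cstdint>

// Simplified stand-ins, for illustration only.
using ItemOwnerId = uint16_t;
constexpr ItemOwnerId kUnknownItemId = 0;

struct BlockCache {
  ItemOwnerId next = 1;
  ItemOwnerId GetNextItemOwnerId() { return next++; }  // allocated on CF creation
  void DiscardItemOwnerId(ItemOwnerId* id) { *id = kUnknownItemId; }  // released on CF destruction
  void Insert(const char* key, ItemOwnerId owner) {
    (void)key;
    (void)owner;  // a real cache would tag the entry with `owner` here
  }
};

struct TableCache {
  ItemOwnerId owner = kUnknownItemId;
  void SetBlockCacheOwnerId(ItemOwnerId id) { owner = id; }
  // Every table reader built by this table cache stamps its block-cache
  // insertions with `owner`, so per-CF usage can be aggregated later.
  void InsertBlock(BlockCache& cache, const char* key) { cache.Insert(key, owner); }
};

struct ColumnFamily {
  BlockCache& cache;
  TableCache table_cache;
  ItemOwnerId owner = kUnknownItemId;
  explicit ColumnFamily(BlockCache& c) : cache(c) {
    owner = cache.GetNextItemOwnerId();
    table_cache.SetBlockCacheOwnerId(owner);
  }
  ~ColumnFamily() { cache.DiscardItemOwnerId(&owner); }
};

int main() {
  BlockCache cache;
  {
    ColumnFamily cf(cache);
    cf.table_cache.InsertBlock(cache, "block-1");  // tagged with cf's owner id
  }  // owner id returned to the cache here
  return 0;
}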
diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc
index 50eb682201..a878baa5d4 100644
--- a/db/db_block_cache_test.cc
+++ b/db/db_block_cache_test.cc
@@ -33,6 +33,14 @@
 #include "util/random.h"
 #include "utilities/fault_injection_fs.h"
 
+#ifdef CALL_WRAPPER
+#undef CALL_WRAPPER
+#endif
+
+#define CALL_WRAPPER(func) \
+  func;                    \
+  ASSERT_FALSE(HasFailure());
+
 namespace ROCKSDB_NAMESPACE {
 
 class DBBlockCacheTest : public DBTestBase {
@@ -145,6 +153,95 @@ class DBBlockCacheTest : public DBTestBase {
     }
     return cache_entry_role_counts;
   }
+
+  bool IsLRUCache(Cache* cache) {
+    return (std::string(cache->Name()) == "LRUCache");
+  }
+
+  InternalStats::CacheEntryRoleStats GetCacheEntryRoleStatsBg() {
+    // Get the cache entry role stats (collected in the background)
+    ColumnFamilyHandleImpl* cfh =
+        static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+    InternalStats* internal_stats_ptr = cfh->cfd()->internal_stats();
+    InternalStats::CacheEntryRoleStats stats;
+    internal_stats_ptr->TEST_GetCacheEntryRoleStats(&stats,
+                                                    /*foreground=*/false);
+    return stats;
+  }
+
+  void ValidateCacheCfMapProperty(
+      const std::vector<ColumnFamilyHandle*>& cf_handles,
+      const InternalStats::CacheEntryRoleStats& actual_stats) {
+    // Get the general block cache entry stats using the default cf, as
+    // we only use the total used bytes, which is the total for all
+    // cf-s in this DB
+    std::map<std::string, std::string> entry_values;
+    ASSERT_TRUE(db_->GetMapProperty(dbfull()->DefaultColumnFamily(),
+                                    DB::Properties::kBlockCacheEntryStats,
+                                    &entry_values));
+    for (auto role : {CacheEntryRole::kDataBlock, CacheEntryRole::kFilterBlock,
+                      CacheEntryRole::kIndexBlock}) {
+      uint64_t total_role_charges_all_cfs_cf_stats = 0U;
+
+      for (const auto cf_handle : cf_handles) {
+        ColumnFamilyHandleImpl* cfh =
+            static_cast<ColumnFamilyHandleImpl*>(cf_handle);
+
+        std::map<std::string, std::string> cf_values;
+        ASSERT_TRUE(db_->GetMapProperty(cfh, DB::Properties::kBlockCacheCfStats,
+                                        &cf_values));
+
+        ASSERT_EQ(cfh->GetName(),
+                  cf_values[BlockCacheCfStatsMapKeys::CfName()]);
+        ASSERT_EQ(actual_stats.cache_id,
+                  cf_values[BlockCacheCfStatsMapKeys::CacheId()]);
+
+        total_role_charges_all_cfs_cf_stats +=
+            std::stoll(cf_values[BlockCacheCfStatsMapKeys::UsedBytes(role)]);
+      }
+
+      auto total_role_charges_global_stats =
+          std::stoll(entry_values[BlockCacheCfStatsMapKeys::UsedBytes(role)]);
+      ASSERT_EQ(total_role_charges_global_stats,
+                total_role_charges_all_cfs_cf_stats)
+          << "Role: " << GetCacheEntryRoleName(role);
+    }
+  }
+
+  void ValidateCacheStats(
+      const std::shared_ptr<Cache>& cache,
+      const std::array<size_t, kNumCacheEntryRoles>& expected_counts) {
+    auto actual_stats = GetCacheEntryRoleStatsBg();
+
+    auto actual_counts = actual_stats.entry_counts;
+    EXPECT_EQ(expected_counts, actual_counts);
+
+    std::vector<ColumnFamilyHandle*> cf_handles(handles_);
+    if (cf_handles.empty()) {
+      cf_handles.push_back(dbfull()->DefaultColumnFamily());
+    }
+
+    if (IsLRUCache(cache.get())) {
+      // For the LRU block cache, verify that the per-item owner id counts
+      // are maintained correctly.
+      // This feature is currently only supported in the LRU cache
+      for (auto role :
+           {CacheEntryRole::kDataBlock, CacheEntryRole::kFilterBlock,
+            CacheEntryRole::kIndexBlock}) {
+        auto role_idx = static_cast<size_t>(role);
+        size_t total_role_charges_all_cfs = 0U;
+        for (const auto cfh : cf_handles) {
+          auto cfh_impl = static_cast<ColumnFamilyHandleImpl*>(cfh);
+          auto cache_owner_id = cfh_impl->cfd()->GetCacheOwnerId();
+          total_role_charges_all_cfs +=
+              actual_stats.charge_per_item_owner[cache_owner_id][role_idx];
+        }
+        ASSERT_EQ(actual_stats.total_charges[role_idx],
+                  total_role_charges_all_cfs);
+      }
+      ValidateCacheCfMapProperty(cf_handles, actual_stats);
+    }
+  }
 };
 
 TEST_F(DBBlockCacheTest, IteratorBlockCacheUsage) {
@@ -629,12 +726,21 @@ class MockCache : public LRUCache {
   Status Insert(const Slice& key, Cache::ObjectPtr value,
                 const Cache::CacheItemHelper* helper, size_t charge,
                 Handle** handle, Priority priority) override {
+    return InsertWithOwnerId(key, value, helper, charge, Cache::kUnknownItemId,
+                             handle, priority);
+  }
+
+  Status InsertWithOwnerId(const Slice& key, Cache::ObjectPtr value,
+                           const Cache::CacheItemHelper* helper, size_t charge,
+                           Cache::ItemOwnerId item_owner_id, Handle** handle,
+                           Priority priority) override {
     if (priority == Priority::LOW) {
       low_pri_insert_count++;
     } else {
       high_pri_insert_count++;
     }
-    return LRUCache::Insert(key, value, helper, charge, handle, priority);
+    return LRUCache::InsertWithOwnerId(key, value, helper, charge,
+                                       item_owner_id, handle, priority);
   }
 };
 
@@ -958,18 +1064,23 @@ TEST_F(DBBlockCacheTest, CacheCompressionDict) {
   }
 }
 
-static void ClearCache(Cache* cache) {
+static void ClearCache(Cache* cache, Cache::ItemOwnerId owner_id_to_clear =
+                                         Cache::kUnknownItemId) {
   std::deque<std::string> keys;
   Cache::ApplyToAllEntriesOptions opts;
   auto callback = [&](const Slice& key, Cache::ObjectPtr, size_t /*charge*/,
-                      const Cache::CacheItemHelper* helper) {
+                      const Cache::CacheItemHelper* helper,
+                      Cache::ItemOwnerId item_owner_id) {
     if (helper && helper->role == CacheEntryRole::kMisc) {
      // Keep the stats collector
      return;
    }
-    keys.push_back(key.ToString());
+    if ((owner_id_to_clear == Cache::kUnknownItemId) ||
+        (item_owner_id == owner_id_to_clear)) {
+      keys.push_back(key.ToString());
+    }
   };
-  cache->ApplyToAllEntries(callback, opts);
+  cache->ApplyToAllEntriesWithOwnerId(callback, opts);
   for (auto& k : keys) {
     cache->Erase(k);
   }
@@ -1031,6 +1142,7 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
     // For CacheEntryStatsCollector
     expected[static_cast<size_t>(CacheEntryRole::kMisc)] = 1;
     EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+    CALL_WRAPPER(ValidateCacheStats(cache, expected));
 
     std::array<size_t, kNumCacheEntryRoles> prev_expected = expected;
@@ -1042,12 +1154,15 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
     }
     // Within some time window, we will get cached entry stats
     EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+    CALL_WRAPPER(ValidateCacheStats(cache, prev_expected));
     // Not enough to force a miss
     env_->MockSleepForSeconds(45);
     EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+    CALL_WRAPPER(ValidateCacheStats(cache, prev_expected));
     // Enough to force a miss
     env_->MockSleepForSeconds(601);
     EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+    CALL_WRAPPER(ValidateCacheStats(cache, expected));
 
     // Now access index and data block
     ASSERT_EQ("value", Get("foo"));
@@ -1070,6 +1185,7 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
         });
     SyncPoint::GetInstance()->EnableProcessing();
     EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+    CALL_WRAPPER(ValidateCacheStats(cache, expected));
     prev_expected = expected;
     SyncPoint::GetInstance()->DisableProcessing();
     SyncPoint::GetInstance()->ClearAllCallBacks();
@@ -1086,9 +1202,11 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
     // a miss
     env_->MockSleepForSeconds(601);
     EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+    CALL_WRAPPER(ValidateCacheStats(cache, prev_expected));
     // But this is enough
     env_->MockSleepForSeconds(10000);
     EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+    CALL_WRAPPER(ValidateCacheStats(cache, expected));
     prev_expected = expected;
 
     // Also check the GetProperty interface
@@ -1102,6 +1220,27 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
                 values[BlockCacheEntryStatsMapKeys::EntryCount(role)]);
     }
 
+    // Also check the GetProperty interface for CF Stats
+    std::map<std::string, std::string> cf_values;
+    ASSERT_TRUE(
+        db_->GetMapProperty(DB::Properties::kBlockCacheCfStats, &cf_values));
+
+    // We have a single CF ("default") => validate the cf stats
+    // accordingly
+    ASSERT_EQ("default", cf_values[BlockCacheCfStatsMapKeys::CfName()]);
+    for (size_t i = 0; i < kNumCacheEntryRoles; ++i) {
+      auto role = static_cast<CacheEntryRole>(i);
+
+      if (IsLRUCache(cache.get())) {
+        ASSERT_EQ(values[BlockCacheEntryStatsMapKeys::UsedBytes(role)],
+                  cf_values[BlockCacheCfStatsMapKeys::UsedBytes(role)]);
+      } else {
+        // CF Stats are currently supported only for the LRU Cache =>
+        // otherwise, the cf stats used counts are expected to be 0
+        ASSERT_EQ("0", cf_values[BlockCacheCfStatsMapKeys::UsedBytes(role)]);
+      }
+    }
+
     // Add one for kWriteBuffer
     {
       WriteBufferManager wbm(size_t{1} << 20, cache);
@@ -1149,9 +1288,11 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
       expected[static_cast<size_t>(CacheEntryRole::kMisc)]++;
       // Still able to hit on saved stats
       EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+      CALL_WRAPPER(ValidateCacheStats(cache, prev_expected));
       // Enough to force a miss
      env_->MockSleepForSeconds(1000);
       EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+      CALL_WRAPPER(ValidateCacheStats(cache, expected));
 
       cache->Release(h);
 
@@ -1216,6 +1357,209 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
   }
 }
 
+TEST_F(DBBlockCacheTest, CacheStatsPerCfMultipleCfs) {
+  const size_t capacity = size_t{1} << 25;
+  auto cache{NewLRUCache(capacity)};
+
+  Options options = CurrentOptions();
+  SetTimeElapseOnlySleepOnReopen(&options);
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.max_open_files = 13;
+  options.table_cache_numshardbits = 0;
+  // If this wakes up, it could interfere with test
+  options.stats_dump_period_sec = 0;
+
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = cache;
+  table_options.cache_index_and_filter_blocks = true;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(50));
+  table_options.metadata_cache_options.top_level_index_pinning =
+      PinningTier::kNone;
+  table_options.metadata_cache_options.partition_pinning = PinningTier::kNone;
+  table_options.metadata_cache_options.unpartitioned_pinning =
+      PinningTier::kNone;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  CreateAndReopenWithCF({"CF1"}, options);
+
+  // Create a new table.
+  ASSERT_OK(Put("foo", "value"));
+  ASSERT_OK(Put("bar", "value"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  ASSERT_OK(Put(1, "zfoo", "value"));
+  ASSERT_OK(Put(1, "zbar", "value"));
+  ASSERT_OK(Flush(1));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0, 1));
+
+  // Fresh cache
+  ClearCache(cache.get());
+
+  std::array<size_t, kNumCacheEntryRoles> expected{};
+  // For CacheEntryStatsCollector
+  expected[static_cast<size_t>(CacheEntryRole::kMisc)] = 1;
+  CALL_WRAPPER(ValidateCacheStats(cache, expected));
+
+  // First access only filters
+  ASSERT_EQ("NOT_FOUND", Get("different from any key added"));
+  ASSERT_EQ("NOT_FOUND", Get(1, "different from any key added"));
+  expected[static_cast<size_t>(CacheEntryRole::kFilterBlock)] += 2;
+  // Enough to force a miss
+  env_->MockSleepForSeconds(601);
+  CALL_WRAPPER(ValidateCacheStats(cache, expected));
+
+  // Now access index and data block
+  ASSERT_EQ("value", Get("foo"));
+  expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
+  expected[static_cast<size_t>(CacheEntryRole::kDataBlock)]++;
+  // Enough to force a miss
+  env_->MockSleepForSeconds(601);
+  CALL_WRAPPER(ValidateCacheStats(cache, expected));
+
+  // The same for the other CF
+  ASSERT_EQ("value", Get(1, "zfoo"));
+  expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
+  expected[static_cast<size_t>(CacheEntryRole::kDataBlock)]++;
+  env_->MockSleepForSeconds(601);
+  CALL_WRAPPER(ValidateCacheStats(cache, expected));
+
+  auto cf1_owner_id = static_cast<ColumnFamilyHandleImpl*>(handles_[1])
+                          ->cfd()
+                          ->GetCacheOwnerId();
+
+  ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+  ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
+  handles_.erase(handles_.begin() + 1);
+
+  --expected[static_cast<size_t>(CacheEntryRole::kFilterBlock)];
+  --expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)];
+  --expected[static_cast<size_t>(CacheEntryRole::kDataBlock)];
+
+  // The cache may have items of CF1 in its LRU which will
+  // be counted => remove them explicitly
+  ClearCache(cache.get(), cf1_owner_id);
+
+  env_->MockSleepForSeconds(601);
+  CALL_WRAPPER(ValidateCacheStats(cache, expected));
+
+  ClearCache(cache.get());
+  std::fill(expected.begin(), expected.end(), 0);
+  // For CacheEntryStatsCollector
+  expected[static_cast<size_t>(CacheEntryRole::kMisc)] = 1;
+  env_->MockSleepForSeconds(601);
+  CALL_WRAPPER(ValidateCacheStats(cache, expected));
+
+  // Add three more CF-s
+  CreateColumnFamilies({"CF2", "CF3", "CF4"}, options);
+
+  for (auto cf_id = 1U; cf_id < 4U; ++cf_id) {
+    ASSERT_OK(Put(cf_id, std::string("CF") + std::to_string(cf_id) + "-foo",
+                  "value"));
+    ASSERT_OK(Flush(cf_id));
+    ASSERT_EQ(1, NumTableFilesAtLevel(0, cf_id));
+  }
+
+  // Fresh cache
+  ClearCache(cache.get());
+
+  ASSERT_EQ("NOT_FOUND", Get(1, "different from any key added"));
+  expected[static_cast<size_t>(CacheEntryRole::kFilterBlock)] += 1;
+
+  ASSERT_EQ("value", Get(2, "CF2-foo"));
+  expected[static_cast<size_t>(CacheEntryRole::kFilterBlock)]++;
+  expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
+  expected[static_cast<size_t>(CacheEntryRole::kDataBlock)]++;
+
+  env_->MockSleepForSeconds(601);
+  CALL_WRAPPER(ValidateCacheStats(cache, expected));
+}
+
+TEST_F(DBBlockCacheTest, ItemIdAllocation) {
+  const size_t capacity = size_t{1} << 25;
+  auto cache{NewLRUCache(capacity)};
+
+  size_t max_num_ids = Cache::kMaxOwnerItemId - Cache::kMinOwnerItemId + 1;
+  auto expected_num_free_ids = max_num_ids;
+
+  // Allocate 10 id-s
+  auto expected_next_id = Cache::kMinOwnerItemId;
+  for (auto i = 0U; i < 10U; ++i) {
+    ASSERT_EQ(cache->GetNextItemOwnerId(), expected_next_id);
+    ++expected_next_id;
+    --expected_num_free_ids;
+  }
+  --expected_next_id;
+
+  // Release all 10 allocated id-s in reverse order
+  Cache::ItemOwnerId to_discard_id = expected_next_id;
+  for (auto i = 0U; i < 10U; ++i) {
+    auto temp = to_discard_id;
+    cache->DiscardItemOwnerId(&temp);
+    ASSERT_EQ(temp, Cache::kUnknownItemId);
+
+    ASSERT_GT(to_discard_id, 0U);
+    --to_discard_id;
+    ++expected_num_free_ids;
+  }
+
+  // Allocate 10 id-s and expect to get the id-s from the free list
+  // in the reverse order
+  ASSERT_EQ(expected_next_id, Cache::kMinOwnerItemId + 9U);
+  for (auto i = 0U; i < 10U; ++i) {
+    ASSERT_EQ(cache->GetNextItemOwnerId(), expected_next_id);
+    ASSERT_GT(expected_next_id, 0U);
+    --expected_next_id;
+    --expected_num_free_ids;
+  }
+
+  ASSERT_EQ(expected_num_free_ids, max_num_ids - 10U);
+
+  // Free list should now be empty
+  // Exhaust all of the id-s before wrap around
+  expected_next_id = Cache::kMinOwnerItemId + 10U;
+  while (expected_num_free_ids > 0U) {
+    ASSERT_EQ(cache->GetNextItemOwnerId(), expected_next_id);
+    ++expected_next_id;
+    --expected_num_free_ids;
+  }
+
+  // Expecting next allocations to fail
+  for (auto i = 0U; i < 5U; ++i) {
+    ASSERT_EQ(cache->GetNextItemOwnerId(), Cache::kUnknownItemId);
+  }
+
+  // Free some arbitrary id-s
+  Cache::ItemOwnerId owner_id = 5000U;
+  cache->DiscardItemOwnerId(&owner_id);
+  owner_id = 1000;
+  cache->DiscardItemOwnerId(&owner_id);
+  owner_id = 3000;
+  cache->DiscardItemOwnerId(&owner_id);
+
+  // Expect allocations to return id-s in the same order as freed
+  ASSERT_EQ(cache->GetNextItemOwnerId(), 5000);
+  ASSERT_EQ(cache->GetNextItemOwnerId(), 1000);
+  ASSERT_EQ(cache->GetNextItemOwnerId(), 3000);
+
+  // All id-s exhausted again
+  ASSERT_EQ(cache->GetNextItemOwnerId(), Cache::kUnknownItemId);
+
+  // Verify the max size of the free list
+  for (auto i = 0U; i < 2 * Cache::kMaxFreeItemOwnersIdListSize; ++i) {
+    owner_id = Cache::kMinOwnerItemId + i;
+    cache->DiscardItemOwnerId(&owner_id);
+  }
+
+  for (auto i = 0U; i < Cache::kMaxFreeItemOwnersIdListSize; ++i) {
+    ASSERT_EQ(cache->GetNextItemOwnerId(), Cache::kMinOwnerItemId + i);
+  }
+
+  // All id-s exhausted again
+  ASSERT_EQ(cache->GetNextItemOwnerId(), Cache::kUnknownItemId);
+}
+
 namespace {
 
 void DummyFillCache(Cache& cache, size_t entry_size,
diff --git a/db/internal_stats.cc b/db/internal_stats.cc
index dda28aeb1f..e7baaf50eb 100644
--- a/db/internal_stats.cc
+++ b/db/internal_stats.cc
@@ -255,6 +255,9 @@ static const std::string levelstats = "levelstats";
 static const std::string block_cache_entry_stats = "block-cache-entry-stats";
 static const std::string fast_block_cache_entry_stats =
     "fast-block-cache-entry-stats";
+static const std::string block_cache_cf_stats = "block-cache-cf-stats";
+static const std::string fast_block_cache_cf_stats =
+    "fast-block-cache-cf-stats";
 static const std::string num_immutable_mem_table = "num-immutable-mem-table";
 static const std::string num_immutable_mem_table_flushed =
     "num-immutable-mem-table-flushed";
@@ -340,6 +343,10 @@ const std::string DB::Properties::kBlockCacheEntryStats =
     rocksdb_prefix + block_cache_entry_stats;
 const std::string DB::Properties::kFastBlockCacheEntryStats =
     rocksdb_prefix + fast_block_cache_entry_stats;
+const std::string DB::Properties::kBlockCacheCfStats =
+    rocksdb_prefix + block_cache_cf_stats;
+const std::string DB::Properties::kFastBlockCacheCfStats =
+    rocksdb_prefix + fast_block_cache_cf_stats;
 const std::string DB::Properties::kNumImmutableMemTable =
     rocksdb_prefix + num_immutable_mem_table;
 const std::string DB::Properties::kNumImmutableMemTableFlushed =
@@ -476,6 +483,12 @@ const UnorderedMap
     {DB::Properties::kFastBlockCacheEntryStats,
      {true, &InternalStats::HandleFastBlockCacheEntryStats, nullptr,
       &InternalStats::HandleFastBlockCacheEntryStatsMap, nullptr}},
+    {DB::Properties::kBlockCacheCfStats,
+     {true, &InternalStats::HandleBlockCacheCfStats, nullptr,
+      &InternalStats::HandleBlockCacheCfStatsMap, nullptr}},
+    {DB::Properties::kFastBlockCacheCfStats,
+     {true, &InternalStats::HandleFastBlockCacheCfStats, nullptr,
+      &InternalStats::HandleFastBlockCacheCfStatsMap, nullptr}},
     {DB::Properties::kSSTables,
      {false, &InternalStats::HandleSsTables, nullptr, nullptr, nullptr}},
     {DB::Properties::kAggregatedTableProperties,
@@ -676,14 +689,17 @@ void InternalStats::CollectCacheEntryStats(bool foreground) {
 }
 
 std::function<void(const Slice& key, Cache::ObjectPtr value, size_t charge,
-                   const Cache::CacheItemHelper* helper)>
+                   const Cache::CacheItemHelper* helper,
+                   Cache::ItemOwnerId item_owner_id)>
 InternalStats::CacheEntryRoleStats::GetEntryCallback() {
   return [&](const Slice& /*key*/, Cache::ObjectPtr /*value*/, size_t charge,
-             const Cache::CacheItemHelper* helper) -> void {
+             const Cache::CacheItemHelper* helper,
+             Cache::ItemOwnerId item_owner_id) -> void {
     size_t role_idx =
        static_cast<size_t>(helper ? helper->role : CacheEntryRole::kMisc);
     entry_counts[role_idx]++;
     total_charges[role_idx] += charge;
+    charge_per_item_owner[item_owner_id][role_idx] += charge;
   };
 }
 
@@ -744,6 +760,38 @@ std::string InternalStats::CacheEntryRoleStats::ToString(
   return str.str();
 }
 
+std::string InternalStats::CacheEntryRoleStats::CacheOwnerStatsToString(
+    const std::string& cf_name, Cache::ItemOwnerId cache_owner_id) {
+  std::ostringstream str;
+
+  const auto& cf_charges_per_role_pos =
+      charge_per_item_owner.find(cache_owner_id);
+
+  std::vector<CacheEntryRole> roles{CacheEntryRole::kDataBlock,
+                                    CacheEntryRole::kFilterBlock,
+                                    CacheEntryRole::kIndexBlock};
+  std::array<size_t, kNumCacheEntryRoles> roles_total_charge{};
+
+  str << "Block cache [" << cf_name << "] ";
+
+  if (cf_charges_per_role_pos != charge_per_item_owner.end()) {
+    const std::array<size_t, kNumCacheEntryRoles>& cf_charges_per_role =
+        cf_charges_per_role_pos->second;
+    for (auto role : roles) {
+      auto role_idx = static_cast<size_t>(role);
+      roles_total_charge[role_idx] = cf_charges_per_role[role_idx];
+    }
+  }
+
+  for (auto role : roles) {
+    auto role_idx = static_cast<size_t>(role);
+    str << " " << kCacheEntryRoleToCamelString[role_idx] << "("
+        << BytesToHumanString(roles_total_charge[role_idx]) << ")";
+  }
+  str << '\n';
+  return str.str();
+}
+
 void InternalStats::CacheEntryRoleStats::ToMap(
     std::map<std::string, std::string>* values, SystemClock* clock) const {
   values->clear();
@@ -766,6 +814,25 @@ void InternalStats::CacheEntryRoleStats::ToMap(
   }
 }
 
+void InternalStats::CacheEntryRoleStats::CacheOwnerStatsToMap(
+    const std::string& cf_name, Cache::ItemOwnerId cache_owner_id,
+    std::map<std::string, std::string>* values) const {
+  values->clear();
+  auto& v = *values;
+  v[BlockCacheCfStatsMapKeys::CfName()] = cf_name;
+  v[BlockCacheCfStatsMapKeys::CacheId()] = cache_id;
+  const auto& cache_owner_charges = charge_per_item_owner.find(cache_owner_id);
+  for (size_t i = 0; i < kNumCacheEntryRoles; ++i) {
+    auto role = static_cast<CacheEntryRole>(i);
+    if (cache_owner_charges != charge_per_item_owner.end()) {
+      v[BlockCacheCfStatsMapKeys::UsedBytes(role)] =
+          std::to_string(charge_per_item_owner.at(cache_owner_id)[i]);
+    } else {
+      v[BlockCacheCfStatsMapKeys::UsedBytes(role)] = "0";
+    }
+  }
+}
+
 bool InternalStats::HandleBlockCacheEntryStatsInternal(std::string* value,
                                                        bool fast) {
   if (!cache_entry_stats_collector_) {
@@ -810,6 +877,51 @@ bool InternalStats::HandleFastBlockCacheEntryStatsMap(
   return HandleBlockCacheEntryStatsMapInternal(values, true /* fast */);
 }
 
+bool InternalStats::HandleBlockCacheCfStatsInternal(std::string* value,
+                                                    bool fast) {
+  if (!cache_entry_stats_collector_) {
+    return false;
+  }
+  CollectCacheEntryStats(!fast /* foreground */);
+  CacheEntryRoleStats stats;
+  cache_entry_stats_collector_->GetStats(&stats);
+  *value =
+      stats.CacheOwnerStatsToString(cfd_->GetName(), cfd_->GetCacheOwnerId());
+  return true;
+}
+
+bool InternalStats::HandleBlockCacheCfStatsMapInternal(
+    std::map<std::string, std::string>* values, bool fast) {
+  if (!cache_entry_stats_collector_) {
+    return false;
+  }
+  CollectCacheEntryStats(!fast /* foreground */);
+  CacheEntryRoleStats stats;
+  cache_entry_stats_collector_->GetStats(&stats);
+  stats.CacheOwnerStatsToMap(cfd_->GetName(), cfd_->GetCacheOwnerId(), values);
+  return true;
+}
+
+bool InternalStats::HandleBlockCacheCfStats(std::string* value,
+                                            Slice /*suffix*/) {
+  return HandleBlockCacheCfStatsInternal(value, false /* fast */);
+}
+
+bool InternalStats::HandleBlockCacheCfStatsMap(
+    std::map<std::string, std::string>* values, Slice /*suffix*/) {
+  return HandleBlockCacheCfStatsMapInternal(values, false /* fast */);
+}
+
+bool InternalStats::HandleFastBlockCacheCfStats(std::string* value,
+                                                Slice /*suffix*/) {
+  return HandleBlockCacheCfStatsInternal(value, true /* fast */);
+}
+
+bool InternalStats::HandleFastBlockCacheCfStatsMap(
+    std::map<std::string, std::string>* values, Slice /*suffix*/) {
+  return HandleBlockCacheCfStatsMapInternal(values, true /* fast */);
+}
+
 bool InternalStats::HandleLiveSstFilesSizeAtTemperature(std::string* value,
                                                         Slice suffix) {
   uint64_t temperature;
@@ -2061,6 +2173,8 @@ void InternalStats::DumpCFStatsNoFileHistogram(bool is_periodic,
       // Skip if stats are extremely old (> 1 day, incl not yet populated)
       if (now_micros - stats.last_end_time_micros_ < kDayInMicros) {
         value->append(stats.ToString(clock_));
+        value->append(stats.CacheOwnerStatsToString(cfd_->GetName(),
+                                                    cfd_->GetCacheOwnerId()));
       }
     }
   }
diff --git a/db/internal_stats.h b/db/internal_stats.h
index 7a600384a7..a058031e17 100644
--- a/db/internal_stats.h
+++ b/db/internal_stats.h
@@ -479,6 +479,13 @@ class InternalStats {
     uint64_t last_start_time_micros_ = 0;
     uint64_t last_end_time_micros_ = 0;
 
+    // Instances of this class are created per cf, but cache stats collection
+    // is expensive and cf-agnostic anyway. Therefore, we store the values
+    // for all cf-s in every instance.
+    std::unordered_map<Cache::ItemOwnerId,
+                       std::array<size_t, kNumCacheEntryRoles>>
+        charge_per_item_owner;
+
     void Clear() {
       // Wipe everything except collection_count
       uint32_t saved_collection_count = collection_count;
@@ -488,7 +495,8 @@ class InternalStats {
 
     void BeginCollection(Cache*, SystemClock*, uint64_t start_time_micros);
     std::function<void(const Slice& key, Cache::ObjectPtr value, size_t charge,
-                       const Cache::CacheItemHelper* helper)>
+                       const Cache::CacheItemHelper* helper,
+                       Cache::ItemOwnerId item_owner_id)>
     GetEntryCallback();
     void EndCollection(Cache*, SystemClock*, uint64_t end_time_micros);
     void SkippedCollection();
@@ -497,6 +505,12 @@ class InternalStats {
     void ToMap(std::map<std::string, std::string>* values,
                SystemClock* clock) const;
 
+    std::string CacheOwnerStatsToString(const std::string& cf_name,
+                                        Cache::ItemOwnerId cache_owner_id);
+    void CacheOwnerStatsToMap(const std::string& cf_name,
+                              Cache::ItemOwnerId cache_owner_id,
+                              std::map<std::string, std::string>* values) const;
+
    private:
     uint64_t GetLastDurationMicros() const;
   };
@@ -845,6 +859,15 @@ class InternalStats {
   bool HandleFastBlockCacheEntryStats(std::string* value, Slice suffix);
   bool HandleFastBlockCacheEntryStatsMap(
       std::map<std::string, std::string>* values, Slice suffix);
+  bool HandleBlockCacheCfStatsInternal(std::string* value, bool fast);
+  bool HandleBlockCacheCfStatsMapInternal(
+      std::map<std::string, std::string>* values, bool fast);
+  bool HandleBlockCacheCfStats(std::string* value, Slice suffix);
+  bool HandleBlockCacheCfStatsMap(std::map<std::string, std::string>* values,
+                                  Slice suffix);
+  bool HandleFastBlockCacheCfStats(std::string* value, Slice suffix);
+  bool HandleFastBlockCacheCfStatsMap(
+      std::map<std::string, std::string>* values, Slice suffix);
   bool HandleLiveSstFilesSizeAtTemperature(std::string* value, Slice suffix);
   bool HandleNumBlobFiles(uint64_t* value, DBImpl* db, Version* version);
   bool HandleBlobStats(std::string* value, Slice suffix);
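For reference, this is how an application would read the new per-CF property once the patch is in. A minimal sketch - the path and setup are illustrative, and the property returns false when the CF does not use a block-cache-backed table factory:

#include <iostream>
#include <map>
#include <string>

#include "rocksdb/cache.h"
#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/percf_stats_demo", &db);
  if (!s.ok()) return 1;

  // Map form: one "bytes.<role>" key per cache entry role, plus the CF name
  // and the cache id.
  std::map<std::string, std::string> cf_stats;
  if (db->GetMapProperty(db->DefaultColumnFamily(),
                         rocksdb::DB::Properties::kBlockCacheCfStats,
                         &cf_stats)) {
    std::cout << "cf: " << cf_stats[rocksdb::BlockCacheCfStatsMapKeys::CfName()]
              << " data-block bytes: "
              << cf_stats[rocksdb::BlockCacheCfStatsMapKeys::UsedBytes(
                     rocksdb::CacheEntryRole::kDataBlock)]
              << "\n";
  }

  // String form: a single human-readable line for this CF.
  std::string str_stats;
  if (db->GetProperty(rocksdb::DB::Properties::kBlockCacheCfStats,
                      &str_stats)) {
    std::cout << str_stats;
  }

  delete db;
  return 0;
}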
diff --git a/db/table_cache.cc b/db/table_cache.cc
index 366d53948f..fbf626b0ed 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -137,15 +137,18 @@ Status TableCache::GetTableReader(
     } else {
       expected_unique_id = kNullUniqueId64x2;  // null ID == no verification
     }
+
+    TableReaderOptions table_reader_options(
+        ioptions_, prefix_extractor, file_options, internal_comparator,
+        skip_filters, immortal_tables_, false /* force_direct_prefetch */,
+        level, is_bottom, block_cache_tracer_, max_file_size_for_l0_meta_pin,
+        db_session_id_, file_meta.fd.GetNumber(), expected_unique_id,
+        file_meta.fd.largest_seqno);
+    table_reader_options.cache_owner_id = cache_owner_id_;
+
     s = ioptions_.table_factory->NewTableReader(
-        ro,
-        TableReaderOptions(ioptions_, prefix_extractor, file_options,
-                           internal_comparator, skip_filters, immortal_tables_,
-                           false /* force_direct_prefetch */, level, is_bottom,
-                           block_cache_tracer_, max_file_size_for_l0_meta_pin,
-                           db_session_id_, file_meta.fd.GetNumber(),
-                           expected_unique_id, file_meta.fd.largest_seqno),
-        std::move(file_reader), file_meta.fd.GetFileSize(), table_reader,
+        ro, table_reader_options, std::move(file_reader),
+        file_meta.fd.GetFileSize(), table_reader,
         prefetch_index_and_filter_in_cache);
     TEST_SYNC_POINT("TableCache::GetTableReader:0");
   }
diff --git a/db/table_cache.h b/db/table_cache.h
index 66282bf41f..76a0239aa2 100644
--- a/db/table_cache.h
+++ b/db/table_cache.h
@@ -227,6 +227,10 @@ class TableCache {
     }
   }
 
+  void SetBlockCacheOwnerId(Cache::ItemOwnerId cache_owner_id) {
+    cache_owner_id_ = cache_owner_id;
+  }
+
  private:
   // Build a table reader
   Status GetTableReader(
@@ -268,6 +272,7 @@ class TableCache {
   Striped loader_mutex_;
   std::shared_ptr<IOTracer> io_tracer_;
   std::string db_session_id_;
+  Cache::ItemOwnerId cache_owner_id_ = Cache::kUnknownItemId;
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/advanced_cache.h b/include/rocksdb/advanced_cache.h
index bd7d5b09c8..6879efcde9 100644
--- a/include/rocksdb/advanced_cache.h
+++ b/include/rocksdb/advanced_cache.h
@@ -9,7 +9,10 @@
 
 #include <cstdint>
 #include <functional>
+#include <limits>
+#include <list>
 #include <memory>
+#include <mutex>
 #include <string>
 
 #include "rocksdb/cache.h"
@@ -65,6 +68,12 @@ class Cache {
   // not set. The "bottom" priority level is for BlobDB's blob values.
   enum class Priority { HIGH, LOW, BOTTOM };
 
+  using ItemOwnerId = uint16_t;
+  static constexpr ItemOwnerId kUnknownItemId = 0U;
+  static constexpr ItemOwnerId kMinOwnerItemId = 1U;
+  static constexpr ItemOwnerId kMaxOwnerItemId =
+      std::numeric_limits<ItemOwnerId>::max();
+
   // A set of callbacks to allow objects in the primary block cache to be
   // be persisted in a secondary cache. The purpose of the secondary cache
   // is to support other ways of caching the object, such as persistent or
@@ -249,6 +258,14 @@ class Cache {
                         Handle** handle = nullptr,
                         Priority priority = Priority::LOW) = 0;
 
+  virtual Status InsertWithOwnerId(const Slice& key, ObjectPtr obj,
+                                   const CacheItemHelper* helper, size_t charge,
+                                   ItemOwnerId /* item_owner_id */,
+                                   Handle** handle = nullptr,
+                                   Priority priority = Priority::LOW) {
+    return Insert(key, obj, helper, charge, handle, priority);
+  }
+
   // Similar to Insert, but used for creating cache entries that cannot
   // be found with Lookup, such as for memory charging purposes. The
   // key is needed for cache sharding purposes.
@@ -389,6 +406,21 @@ class Cache {
                          const CacheItemHelper* helper)>& callback,
       const ApplyToAllEntriesOptions& opts) = 0;
 
+  virtual void ApplyToAllEntriesWithOwnerId(
+      const std::function<void(const Slice& key, ObjectPtr obj, size_t charge,
+                               const CacheItemHelper* helper,
+                               ItemOwnerId item_owner_id)>&
+          callback_with_owner_id,
+      const ApplyToAllEntriesOptions& opts) {
+    auto callback = [&callback_with_owner_id](const Slice& key, ObjectPtr obj,
+                                              size_t charge,
+                                              const CacheItemHelper* helper) {
+      callback_with_owner_id(key, obj, charge, helper, Cache::kUnknownItemId);
+    };
+
+    return ApplyToAllEntries(callback, opts);
+  }
+
   // Remove all entries.
   // Prerequisite: no entry is referenced.
   virtual void EraseUnRefEntries() = 0;
@@ -517,9 +549,32 @@ class Cache {
   // or destruction, guaranteed before or after any thread-shared operations.
   void SetEvictionCallback(EvictionCallback&& fn);
 
+  ItemOwnerId GetNextItemOwnerId();
+  // On return, sets the owner id to kUnknownItemId
+  void DiscardItemOwnerId(ItemOwnerId*);
+
  protected:
   std::shared_ptr<MemoryAllocator> memory_allocator_;
   EvictionCallback eviction_callback_;
+
+ public:
+  // Public so it is accessible from the unit tests (just a constant)
+  static constexpr size_t kMaxFreeItemOwnersIdListSize = 10000U;
+
+ private:
+  class ItemOwnerIdAllocator {
+   public:
+    ItemOwnerId Allocate();
+    void Free(ItemOwnerId* id);
+
+   private:
+    ItemOwnerId next_item_owner_id_ = kMinOwnerItemId;
+    bool has_wrapped_around_ = false;
+    std::mutex free_ids_mutex_;
+    std::list<ItemOwnerId> free_ids_;
+  };
+
+  ItemOwnerIdAllocator owner_id_allocator_;
 };
 
 // A wrapper around Cache that can easily be extended with instrumentation,
diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h
index 387da17539..14757440de 100644
--- a/include/rocksdb/cache.h
+++ b/include/rocksdb/cache.h
@@ -88,6 +88,15 @@ struct BlockCacheEntryStatsMapKeys {
   static std::string UsedPercent(CacheEntryRole);
 };
 
+// For use with `GetMapProperty()` for property
+// `DB::Properties::kBlockCacheCfStats`. On success, the map will
+// be populated with all keys that can be obtained from these functions.
+struct BlockCacheCfStatsMapKeys {
+  static const std::string& CfName();
+  static const std::string& CacheId();
+  static std::string UsedBytes(CacheEntryRole);
+};
+
 extern const bool kDefaultToAdaptiveMutex;
 
 enum CacheMetadataChargePolicy {
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 6539eb8aeb..a385adb6fe 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -986,6 +986,14 @@ class DB {
     // stale values more frequently to reduce overhead and latency.
     static const std::string kFastBlockCacheEntryStats;
 
+    // "rocksdb.block-cache-cf-stats" - returns a multi-line string
+    // with statistics on block cache usage for a specific column family.
+    static const std::string kBlockCacheCfStats;
+
+    // "rocksdb.fast-block-cache-cf-stats" - same as above, but returns
+    // stale values more frequently to reduce overhead and latency.
+    static const std::string kFastBlockCacheCfStats;
+
     // "rocksdb.num-immutable-mem-table" - returns number of immutable
     // memtables that have not yet been flushed.
     static const std::string kNumImmutableMemTable;
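Because the new Cache virtuals above have forwarding defaults (InsertWithOwnerId -> Insert, and ApplyToAllEntriesWithOwnerId -> ApplyToAllEntries reporting kUnknownItemId), out-of-tree Cache implementations stay source- and behavior-compatible. A hypothetical instrumentation wrapper built on the CacheWrapper mentioned above, counting insertions that arrive with a real owner id (illustrative only, and it assumes this patch is applied):

#include <atomic>
#include <cstdint>

#include "rocksdb/advanced_cache.h"

namespace ROCKSDB_NAMESPACE {

// Hypothetical example: counts how many insertions are already tagged
// with a non-unknown owner id before forwarding to the wrapped cache.
class OwnerIdCountingCache : public CacheWrapper {
 public:
  using CacheWrapper::CacheWrapper;

  const char* Name() const override { return "OwnerIdCountingCache"; }

  Status InsertWithOwnerId(const Slice& key, ObjectPtr obj,
                           const CacheItemHelper* helper, size_t charge,
                           ItemOwnerId item_owner_id, Handle** handle = nullptr,
                           Priority priority = Priority::LOW) override {
    if (item_owner_id != kUnknownItemId) {
      tagged_inserts_.fetch_add(1, std::memory_order_relaxed);
    }
    return target_->InsertWithOwnerId(key, obj, helper, charge, item_owner_id,
                                      handle, priority);
  }

  uint64_t tagged_inserts() const {
    return tagged_inserts_.load(std::memory_order_relaxed);
  }

 private:
  std::atomic<uint64_t> tagged_inserts_{0};
};

}  // namespace ROCKSDB_NAMESPACE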
diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc
index 1af24898eb..d83ca0d3b2 100644
--- a/table/block_based/block_based_table_factory.cc
+++ b/table/block_based/block_based_table_factory.cc
@@ -585,7 +585,7 @@ Status BlockBasedTableFactory::NewTableReader(
       table_reader_options.block_cache_tracer,
       table_reader_options.max_file_size_for_l0_meta_pin,
       table_reader_options.cur_db_session_id, table_reader_options.cur_file_num,
-      table_reader_options.unique_id);
+      table_reader_options.unique_id, table_reader_options.cache_owner_id);
 }
 
 TableBuilder* BlockBasedTableFactory::NewTableBuilder(
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index a0119da10b..6c0f0022ae 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -570,7 +570,8 @@ Status BlockBasedTable::Open(
     TailPrefetchStats* tail_prefetch_stats,
     BlockCacheTracer* const block_cache_tracer,
     size_t max_file_size_for_l0_meta_pin, const std::string& cur_db_session_id,
-    uint64_t cur_file_num, UniqueId64x2 expected_unique_id) {
+    uint64_t cur_file_num, UniqueId64x2 expected_unique_id,
+    Cache::ItemOwnerId cache_owner_id) {
   table_reader->reset();
 
   Status s;
@@ -629,9 +630,9 @@ Status BlockBasedTable::Open(
   }
 
   BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
-  Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options,
-                                      internal_comparator, skip_filters,
-                                      file_size, level, immortal_table);
+  Rep* rep = new BlockBasedTable::Rep(
+      ioptions, env_options, table_options, internal_comparator, skip_filters,
+      file_size, level, immortal_table, cache_owner_id);
   rep->file = std::move(file);
   rep->footer = footer;
@@ -1327,7 +1328,8 @@ WithBlocklikeCheck<Status, TBlocklike> BlockBasedTable::PutDataBlockToCache(
     BlockCacheTypedHandle<TBlocklike>* cache_handle = nullptr;
     s = block_cache.InsertFull(cache_key, block_holder.get(), charge,
                                &cache_handle, GetCachePriority<TBlocklike>(),
-                               rep_->ioptions.lowest_used_cache_tier);
+                               rep_->ioptions.lowest_used_cache_tier,
+                               rep_->cache_owner_id);
 
     if (s.ok()) {
       assert(cache_handle != nullptr);
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index b60f815dc3..8fa406315d 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -112,7 +112,8 @@ class BlockBasedTable : public TableReader {
       BlockCacheTracer* const block_cache_tracer = nullptr,
       size_t max_file_size_for_l0_meta_pin = 0,
       const std::string& cur_db_session_id = "", uint64_t cur_file_num = 0,
-      UniqueId64x2 expected_unique_id = {});
+      UniqueId64x2 expected_unique_id = {},
+      Cache::ItemOwnerId cache_owner_id = Cache::kUnknownItemId);
 
   bool PrefixRangeMayMatch(const Slice& internal_key,
                            const ReadOptions& read_options,
@@ -532,7 +533,8 @@ struct BlockBasedTable::Rep {
   Rep(const ImmutableOptions& _ioptions, const EnvOptions& _env_options,
       const BlockBasedTableOptions& _table_opt,
       const InternalKeyComparator& _internal_comparator, bool skip_filters,
-      uint64_t _file_size, int _level, const bool _immortal_table)
+      uint64_t _file_size, int _level, const bool _immortal_table,
+      Cache::ItemOwnerId _cache_owner_id = Cache::kUnknownItemId)
       : ioptions(_ioptions),
         env_options(_env_options),
         table_options(_table_opt),
@@ -545,7 +547,8 @@ struct BlockBasedTable::Rep {
         global_seqno(kDisableGlobalSequenceNumber),
         file_size(_file_size),
         level(_level),
-        immortal_table(_immortal_table) {}
+        immortal_table(_immortal_table),
+        cache_owner_id(_cache_owner_id) {}
   ~Rep() { status.PermitUncheckedError(); }
   const ImmutableOptions& ioptions;
   const EnvOptions& env_options;
@@ -613,6 +616,8 @@ struct BlockBasedTable::Rep {
 
   const bool immortal_table;
 
+  Cache::ItemOwnerId cache_owner_id = Cache::kUnknownItemId;
+
   std::unique_ptr<CacheReservationManager::CacheReservationHandle>
       table_reader_cache_res_handle = nullptr;
diff --git a/table/table_builder.h b/table/table_builder.h
index 7c3388e4cd..01bba03fda 100644
--- a/table/table_builder.h
+++ b/table/table_builder.h
@@ -20,6 +20,7 @@
 #include "db/table_properties_collector.h"
 #include "file/writable_file_writer.h"
 #include "options/cf_options.h"
+#include "rocksdb/cache.h"
 #include "rocksdb/options.h"
 #include "rocksdb/table_properties.h"
 #include "table/unique_id_impl.h"
@@ -90,6 +91,8 @@ struct TableReaderOptions {
 
   // Known unique_id or {}, kNullUniqueId64x2 means unknown
   UniqueId64x2 unique_id;
+
+  Cache::ItemOwnerId cache_owner_id = Cache::kUnknownItemId;
 };
 
 struct TableBuilderOptions {
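For context, the per-CF line that DumpCFStatsNoFileHistogram now appends to the CF stats dump (via CacheOwnerStatsToString) would look roughly like this - byte values are illustrative and BytesToHumanString chooses the unit:

Block cache [default]  DataBlock(1.25 KB) FilterBlock(0.55 KB) IndexBlock(0.32 KB)

The map form returned for kBlockCacheCfStats uses the keys defined in cache_entry_roles.cc, e.g. (illustrative values; the exact cache id format comes from the existing entry-stats collector):

cf_name          -> default
id               -> LRUCache@0x7f..#1234
bytes.data-block -> 1280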