From 38458918da253b45d90f308791b32ffb510e354e Mon Sep 17 00:00:00 2001 From: ptrus Date: Mon, 30 Oct 2023 14:05:58 +0100 Subject: [PATCH 01/28] wip --- go/common/keyformat/key_format.go | 2 +- go/consensus/cometbft/abci/state.go | 2 +- go/go.mod | 1 + go/go.sum | 2 + go/nexus-genesis.json | 300 +++++ go/storage/database/database.go | 11 + go/storage/database/database_test.go | 1 + go/storage/mkvs/checkpoint/checkpoint_test.go | 126 +- .../mkvs/checkpoint/checkpointer_test.go | 38 +- go/storage/mkvs/db/api/api.go | 4 +- go/storage/mkvs/db/badger/badger.go | 45 +- go/storage/mkvs/db/badger/badger_test.go | 86 -- go/storage/mkvs/db/badger/check.go | 4 +- go/storage/mkvs/db/badger/metadata.go | 5 +- go/storage/mkvs/db/badger/migrate.go | 44 +- go/storage/mkvs/db/db_test.go | 226 +++- go/storage/mkvs/db/rocksdb/batch.go | 246 ++++ go/storage/mkvs/db/rocksdb/iterator.go | 153 +++ go/storage/mkvs/db/rocksdb/metadata.go | 153 +++ go/storage/mkvs/db/rocksdb/rocksdb.go | 1025 +++++++++++++++++ go/storage/mkvs/db/rocksdb/rocksdb_test.go | 299 +++++ go/storage/mkvs/db/rocksdb/timestamp.go | 78 ++ go/storage/mkvs/{db/badger => node}/hash.go | 43 +- go/storage/mkvs/tree_test.go | 25 + 24 files changed, 2727 insertions(+), 192 deletions(-) create mode 100644 go/nexus-genesis.json create mode 100644 go/storage/mkvs/db/rocksdb/batch.go create mode 100644 go/storage/mkvs/db/rocksdb/iterator.go create mode 100644 go/storage/mkvs/db/rocksdb/metadata.go create mode 100644 go/storage/mkvs/db/rocksdb/rocksdb.go create mode 100644 go/storage/mkvs/db/rocksdb/rocksdb_test.go create mode 100644 go/storage/mkvs/db/rocksdb/timestamp.go rename go/storage/mkvs/{db/badger => node}/hash.go (58%) diff --git a/go/common/keyformat/key_format.go b/go/common/keyformat/key_format.go index 82fa35a5c41..8f3af2d9da1 100644 --- a/go/common/keyformat/key_format.go +++ b/go/common/keyformat/key_format.go @@ -157,7 +157,7 @@ func (k *KeyFormat) Encode(values ...interface{}) []byte { panic(fmt.Sprintf("key format: failed to marshal element %d: %s", i, err)) } if len(data) != meta.size { - panic(fmt.Sprintf("key format: unexpected marshalled size %d for element %d", len(data), i)) + panic(fmt.Sprintf("key format: unexpected marshalled size %d for element %d (expected: %d)", len(data), i, meta.size)) } copy(buf[:], data) diff --git a/go/consensus/cometbft/abci/state.go b/go/consensus/cometbft/abci/state.go index caa7e65db0b..88c993fe8a4 100644 --- a/go/consensus/cometbft/abci/state.go +++ b/go/consensus/cometbft/abci/state.go @@ -536,7 +536,7 @@ func (s *applicationState) resetProposalIfChanged(h []byte) bool { } func (s *applicationState) updateMetrics() error { - var dbSize int64 + var dbSize uint64 var err error if dbSize, err = s.storage.NodeDB().Size(); err != nil { s.logger.Error("Size", diff --git a/go/go.mod b/go/go.mod index 047072daf9f..8a11daa8984 100644 --- a/go/go.mod +++ b/go/go.mod @@ -126,6 +126,7 @@ require ( github.com/libp2p/go-netroute v0.2.1 // indirect github.com/libp2p/go-reuseport v0.4.0 // indirect github.com/libp2p/go-yamux/v4 v4.0.1 // indirect + github.com/linxGnu/grocksdb v1.8.4 // indirect github.com/magiconair/properties v1.8.7 // indirect github.com/marten-seemann/tcp v0.0.0-20210406111302-dfbc87cc63fd // indirect github.com/mattn/go-colorable v0.1.13 // indirect diff --git a/go/go.sum b/go/go.sum index d67be5e4130..1fb1b3f5ae5 100644 --- a/go/go.sum +++ b/go/go.sum @@ -423,6 +423,8 @@ github.com/libp2p/go-reuseport v0.4.0 h1:nR5KU7hD0WxXCJbmw7r2rhRYruNRl2koHw8fQsc github.com/libp2p/go-reuseport 
v0.4.0/go.mod h1:ZtI03j/wO5hZVDFo2jKywN6bYKWLOy8Se6DrI2E1cLU= github.com/libp2p/go-yamux/v4 v4.0.1 h1:FfDR4S1wj6Bw2Pqbc8Uz7pCxeRBPbwsBbEdfwiCypkQ= github.com/libp2p/go-yamux/v4 v4.0.1/go.mod h1:NWjl8ZTLOGlozrXSOZ/HlfG++39iKNnM5wwmtQP1YB4= +github.com/linxGnu/grocksdb v1.8.4 h1:ZMsBpPpJNtRLHiKKp0mI7gW+NT4s7UgfD5xHxx1jVRo= +github.com/linxGnu/grocksdb v1.8.4/go.mod h1:xZCIb5Muw+nhbDK4Y5UJuOrin5MceOuiXkVUR7vp4WY= github.com/lunixbochs/vtclean v1.0.0/go.mod h1:pHhQNgMf3btfWnGBVipUOjRYhoOsdGqdm/+2c2E2WMI= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= diff --git a/go/nexus-genesis.json b/go/nexus-genesis.json new file mode 100644 index 00000000000..5815b30aab8 --- /dev/null +++ b/go/nexus-genesis.json @@ -0,0 +1,300 @@ +{ + "height": 1, + "genesis_time": "2022-09-26T04:06:00.331342488Z", + "chain_id": "oasis-3", + "comment ^": "This is not the real mainnet. However, we match its chain_id so that we can test sql migrations (which use the chain_id as part of the fully qualified table names)", + "registry": { + "params": { + "debug_allow_unroutable_addresses": true, + "debug_allow_test_runtimes": true, + "gas_costs": { + "deregister_entity": 1000, + "register_entity": 1000, + "register_node": 1000, + "register_runtime": 1000, + "runtime_epoch_maintenance": 1000, + "unfreeze_node": 1000, + "update_keymanager": 1000 + }, + "max_node_expiration": 5, + "enable_runtime_governance_models": { + "entity": true, + "runtime": true + } + }, + "entities": [ + { + "untrusted_raw_value": "o2F2AmJpZFggJTUtHd4XYQjh//e6eYU7Pa/XMFG88WE+jixvceIfWrllbm9kZXOBWCAtC7hm0WDw4nQwLgzhAx5RHsizpe3gD8Jb48r/tM+IfQ==", + "signature": { + "public_key": "JTUtHd4XYQjh//e6eYU7Pa/XMFG88WE+jixvceIfWrk=", + "signature": "L9wqH/IjJ3AdgledgK/1qU86f5kKjWy/zpd3cS8YkOYZyQBi+z98wNANy6ACiW3kpAD5uI/qcTg/ez+nE7dJCw==" + } + }, + { + "untrusted_raw_value": "omF2AmJpZFgg+MJpnSTzc11dNI5emMa+asCJH5cxBiBCcpbYE4XBdso=", + "signature": { + "public_key": "+MJpnSTzc11dNI5emMa+asCJH5cxBiBCcpbYE4XBdso=", + "signature": "LXp4Fl89LDJyqHR92PWfIN5yod+eOwZBfresKfiEoMReuZugzbIFjMuBrC8ruazd/UAbfU6r6MPIM4H6YQOvCA==" + } + }, + { + "untrusted_raw_value": "omF2AmJpZFggHc0T6ypN7Ytv3t+n5LyJjCd93geUMTo82BR8iS1sRkY=", + "signature": { + "public_key": "Hc0T6ypN7Ytv3t+n5LyJjCd93geUMTo82BR8iS1sRkY=", + "signature": "IRxC/BtxNGVjN21fzNGu/rNm1FN0wTwdrffEUJzigPOuSaP6dxNtxxKYWm89AMIByc29g2qtv33UPImDwH0lDQ==" + } + }, + { + "untrusted_raw_value": "omF2AmJpZFggBTDHzkZYnKPQKUcnN4ieYpLBKRLMu/tmktwpbLduAj8=", + "signature": { + "public_key": "BTDHzkZYnKPQKUcnN4ieYpLBKRLMu/tmktwpbLduAj8=", + "signature": "VDTt/fqiGFjFYUvAr7ar2LMSzlb5FEQnjzpJYrgUlhdFKukJI3cyCu0RApy9d4LM+eiv3L8uj5nmHLItovX2Aw==" + } + }, + { + "untrusted_raw_value": "omF2AmJpZFggkNGfL2S9wpni8G2phHcbUYRmnwSsZFny90VFTMXvDKo=", + "signature": { + "public_key": "kNGfL2S9wpni8G2phHcbUYRmnwSsZFny90VFTMXvDKo=", + "signature": "Dpa4leeFoFLd2OK5AsxqVghSivB8p2dibzVERJgODzGZDcoFpTurTX2g+MuIEoIP4TjWWr432l4i9lW1nniJAg==" + } + }, + { + "untrusted_raw_value": "omF2AmJpZFggTqUyj5Q+9vZtqu10yw6Zw7HEX3Ywe0JQA9vHyzY47TU=", + "signature": { + "public_key": "TqUyj5Q+9vZtqu10yw6Zw7HEX3Ywe0JQA9vHyzY47TU=", + "signature": "UiuPWSjYdXih3iEGBTPyWTWoXb4cXfPQi4xnrlgooXpin876uZe4Uy3D5tLYrqDCGpDJ+r/8r0gTJ6aIwCuvBQ==" + } + } + ], + "nodes": [ + { + "untrusted_raw_value": 
"q2F2AmJpZFggLQu4ZtFg8OJ0MC4M4QMeUR7Is6Xt4A/CW+PK/7TPiH1jcDJwomJpZFgg3NaiXoRM24g/ICmKIG3/UO0OQxe+2irGUZ7rWh8J+TBpYWRkcmVzc2Vz9mN0bHOjZ3B1Yl9rZXlYIJ7u/tQnAGPpg60PtX1KvTS5WvJ7xPLQtiO4rd3x0jOjaWFkZHJlc3Nlc/ZsbmV4dF9wdWJfa2V5WCC/5Vy8kvE846VcbsK3Haujk9mJKeBnKBnn45riiVdgemN2cmahYmlkWCBKt2yZ3y0U29agfBjZ6cyrZE2gQIHBOS0zBrZ5wMbKLmVyb2xlcwhocnVudGltZXP2aWNvbnNlbnN1c6JiaWRYIChZHJhhMXcGCXa4Jw9tMG5wERuu5KB0GZOVx0iJhjekaWFkZHJlc3Nlc4GiYmlkWCDc1qJehEzbiD8gKYogbf9Q7Q5DF77aKsZRnutaHwn5MGdhZGRyZXNzo2JJUFAAAAAAAAAAAAAA//9/AAABZFBvcnQZTiFkWm9uZWBpZW50aXR5X2lkWCAlNS0d3hdhCOH/97p5hTs9r9cwUbzxYT6OLG9x4h9auWpleHBpcmF0aW9uAXBzb2Z0d2FyZV92ZXJzaW9uZzIyLjEuMTA=", + "signatures": [ + { + "public_key": "LQu4ZtFg8OJ0MC4M4QMeUR7Is6Xt4A/CW+PK/7TPiH0=", + "signature": "GYM39hlnmmJFP0CjQqxfQnJUC/QMqEC8HGHuJaZVIBxTWMM87qjdbic6q0qQTS3F/MPm33WYCnBd2WJx0rKdAw==" + }, + { + "public_key": "3NaiXoRM24g/ICmKIG3/UO0OQxe+2irGUZ7rWh8J+TA=", + "signature": "i052fsAWa/t6QscGq7FqGwB70Fq9/p/ed2nnAPFlcba1LMZ83lzSLTlvZ/M1z8Gz4AlEJqI0unPqOiUndIxeAQ==" + }, + { + "public_key": "KFkcmGExdwYJdrgnD20wbnARG67koHQZk5XHSImGN6Q=", + "signature": "4/UsmJNxa8qhTYInLd3cRkDqJhtXpM136DfoZJxcWTomHeMeGQG6xHuAwuFiWl8PtAsLnOYWDErUxJ0qI/PtCw==" + }, + { + "public_key": "Srdsmd8tFNvWoHwY2enMq2RNoECBwTktMwa2ecDGyi4=", + "signature": "wKaq0rD0FrC58ZsIC0L3WI9/pys6PpvMAUpVsRmmMMly9fLgBrEcO30uNwSxgPRwCrA0efVUX189nLAjVgzkBw==" + }, + { + "public_key": "nu7+1CcAY+mDrQ+1fUq9NLla8nvE8tC2I7it3fHSM6M=", + "signature": "vbjuhiDjBg6FbOZOIN6ac7X9enMVCU4kkGz/7oYPG6gAKAhq3bb3A+VdJvULVU+9CMODNV8DI5sKh0xR6eJHBA==" + } + ] + } + ] + }, + "roothash": { + "params": { + "gas_costs": { + "compute_commit": 1000, + "evidence": 1000, + "proposer_timeout": 1000, + "submit_msg": 1000 + }, + "max_runtime_messages": 128, + "max_in_runtime_messages": 128, + "max_evidence_age": 0 + } + }, + "staking": { + "params": { + "thresholds": { + "entity": "0", + "node-compute": "0", + "node-keymanager": "0", + "node-validator": "0", + "runtime-compute": "0", + "runtime-keymanager": "0", + "node-observer": "0" + }, + "debonding_interval": 1, + "commission_schedule_rules": {}, + "min_delegation": "0", + "min_transfer": "0", + "min_transact_balance": "0", + "fee_split_weight_propose": "0", + "fee_split_weight_vote": "1", + "fee_split_weight_next_propose": "0", + "reward_factor_epoch_signed": "0", + "reward_factor_block_proposed": "0" + }, + "token_symbol": "TEST", + "token_value_exponent": 6, + "total_supply": "5200000000000", + "common_pool": "0", + "last_block_fees": "0", + "governance_deposits": "0", + "ledger": { + "oasis1qqczr9vgvp9gysgv0jx3ywww4gccyhqq3g8aygw4": { + "general": { + "balance": "1000000000000" + }, + "escrow": { + "active": { + "balance": "0", + "total_shares": "0" + }, + "debonding": { + "balance": "0", + "total_shares": "0" + }, + "commission_schedule": {}, + "stake_accumulator": {} + } + }, + "oasis1qqncl383h8458mr9cytatygctzwsx02n4c5f8ed7": { + "general": { + "balance": "1000000000000" + }, + "escrow": { + "active": { + "balance": "0", + "total_shares": "0" + }, + "debonding": { + "balance": "0", + "total_shares": "0" + }, + "commission_schedule": {}, + "stake_accumulator": {} + } + }, + "oasis1qqw3ka3eeuy5qaytyhesxtj4fe5pp0xkdy954uwk": { + "general": { + "balance": "1000000000000" + }, + "escrow": { + "active": { + "balance": "0", + "total_shares": "0" + }, + "debonding": { + "balance": "0", + "total_shares": "0" + }, + "commission_schedule": {}, + "stake_accumulator": {} + } + }, + "oasis1qrz6kjp9lu6vc6snhlszq3p2nlx76qasaqr2auqk": { + "general": { + "balance": "1000000000000" + 
}, + "escrow": { + "active": { + "balance": "0", + "total_shares": "0" + }, + "debonding": { + "balance": "0", + "total_shares": "0" + }, + "commission_schedule": {}, + "stake_accumulator": {} + } + }, + "oasis1qznshq4ttrgh83d9wqvgmsuq3pfsndg3tus7lx98": { + "general": { + "balance": "1000000000000" + }, + "escrow": { + "active": { + "balance": "0", + "total_shares": "0" + }, + "debonding": { + "balance": "0", + "total_shares": "0" + }, + "commission_schedule": {}, + "stake_accumulator": {} + } + }, + "oasis1qzzd6khm3acqskpxlk9vd5044cmmcce78y5l6000": { + "general": { + "balance": "100000000000" + }, + "escrow": { + "active": { + "balance": "100000000000", + "total_shares": "1" + }, + "debonding": { + "balance": "0", + "total_shares": "0" + }, + "commission_schedule": {}, + "stake_accumulator": {} + } + } + }, + "delegations": { + "oasis1qzzd6khm3acqskpxlk9vd5044cmmcce78y5l6000": { + "oasis1qzzd6khm3acqskpxlk9vd5044cmmcce78y5l6000": { + "shares": "1" + } + } + } + }, + "keymanager": {}, + "scheduler": { + "params": { + "min_validators": 1, + "max_validators": 100, + "max_validators_per_entity": 1, + "reward_factor_epoch_election_any": "0" + } + }, + "beacon": { + "base": 0, + "params": { + "backend": "insecure", + "insecure_parameters": { + "interval": 30 + } + } + }, + "governance": { + "params": { + "gas_costs": { + "cast_vote": 1000, + "submit_proposal": 1000 + }, + "min_proposal_deposit": "100", + "voting_period": 100, + "stake_threshold": 90, + "upgrade_min_epoch_diff": 300, + "upgrade_cancel_min_epoch_diff": 300 + } + }, + "consensus": { + "backend": "tendermint", + "params": { + "timeout_commit": 1000000000, + "skip_timeout_commit": false, + "empty_block_interval": 0, + "max_tx_size": 32768, + "max_block_size": 22020096, + "max_block_gas": 0, + "max_evidence_size": 1048576, + "state_checkpoint_interval": 0, + "state_checkpoint_chunk_size": 8388608, + "gas_costs": { + "tx_byte": 0 + } + } + }, + "halt_epoch": 86400, + "extra_data": null +} diff --git a/go/storage/database/database.go b/go/storage/database/database.go index a91879cebb3..684c9baa17c 100644 --- a/go/storage/database/database.go +++ b/go/storage/database/database.go @@ -12,15 +12,22 @@ import ( "github.com/oasisprotocol/oasis-core/go/storage/mkvs/checkpoint" nodedb "github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/api" badgerNodedb "github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/badger" + "github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/rocksdb" ) const ( // BackendNameBadgerDB is the name of the BadgeDB backed database backend. BackendNameBadgerDB = "badger" + // BackendNameRocksDB is the name of the RocksDB backed database backend. + BackendNameRocksDB = "rocksdb" + // DBFileBadgerDB is the default BadgerDB backing store filename. DBFileBadgerDB = "mkvs_storage.badger.db" + // DBFileRocksDB is the default RocksDB backing store filename. 
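+	// As with the BadgerDB file above, DefaultFileName returns this name for
+	// the RocksDB backend so callers can place it under their data directory.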
+	DBFileRocksDB = "mkvs_storage.rocksdb.db"
+
 	checkpointDir = "checkpoints"
 )
 
@@ -30,6 +37,8 @@ func DefaultFileName(backend string) string {
 	switch backend {
 	case BackendNameBadgerDB:
 		return DBFileBadgerDB
+	case BackendNameRocksDB:
+		return DBFileRocksDB
 	default:
 		panic("storage/database: can't get default filename for unknown backend")
 	}
@@ -56,6 +65,8 @@ func New(cfg *api.Config) (api.LocalBackend, error) {
 	switch cfg.Backend {
 	case BackendNameBadgerDB:
 		ndb, err = badgerNodedb.New(ndbCfg)
+	case BackendNameRocksDB:
+		ndb, err = rocksdb.New(ndbCfg)
 	default:
 		err = errors.New("storage/database: unsupported backend")
 	}
diff --git a/go/storage/mkvs/checkpoint/checkpoint_test.go b/go/storage/mkvs/checkpoint/checkpoint_test.go
index d7c2fffca29..62e0d8bc984 100644
--- a/go/storage/mkvs/checkpoint/checkpoint_test.go
+++ b/go/storage/mkvs/checkpoint/checkpoint_test.go
@@ -1,4 +1,4 @@
-package checkpoint
+package checkpoint_test
 
 import (
 	"bytes"
@@ -16,15 +16,93 @@ import (
 	"github.com/oasisprotocol/oasis-core/go/common"
 	"github.com/oasisprotocol/oasis-core/go/common/cbor"
 	"github.com/oasisprotocol/oasis-core/go/common/crypto/hash"
 	"github.com/oasisprotocol/oasis-core/go/storage/mkvs"
+	"github.com/oasisprotocol/oasis-core/go/storage/mkvs/checkpoint"
 	db "github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/api"
 	badgerDb "github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/badger"
+	"github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/rocksdb"
 	"github.com/oasisprotocol/oasis-core/go/storage/mkvs/node"
 )
 
+type NodeDBFactory func(cfg *db.Config) (db.NodeDB, error)
+
 var testNs = common.NewTestNamespaceFromSeed([]byte("oasis mkvs checkpoint test ns"), 0)
 
-func TestFileCheckpointCreator(t *testing.T) {
+func TestBadgerBackend(t *testing.T) {
+	testBackend(t, func(t *testing.T) (NodeDBFactory, func()) {
+		// Create a new random temporary directory under /tmp.
+		dir, err := os.MkdirTemp("", "mkvs.checkpoint_test.badger")
+		require.NoError(t, err, "TempDir")
+
+		// Create a Badger-backed Node DB factory.
+		factory := func(cfg *db.Config) (db.NodeDB, error) {
+			return badgerDb.New(cfg)
+		}
+
+		cleanup := func() {
+			os.RemoveAll(dir)
+		}
+
+		return factory, cleanup
+	}, nil)
+}
+
+func TestRocksDBBackend(t *testing.T) {
+	testBackend(t, func(t *testing.T) (NodeDBFactory, func()) {
+		// Create a new random temporary directory under /tmp.
+		dir, err := os.MkdirTemp("", "mkvs.checkpoint_test.rocksdb")
+		require.NoError(t, err, "TempDir")
+
+		// Create a RocksDB-backed Node DB factory.
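+		// When the test config does not set a database path, fall back to the
+		// shared temporary directory created above.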
+		factory := func(cfg *db.Config) (db.NodeDB, error) {
+			if cfg.DB == "" {
+				cfg.DB = dir
+			}
+			return rocksdb.New(cfg)
+		}
+
+		cleanup := func() {
+			os.RemoveAll(dir)
+		}
+
+		return factory, cleanup
+	}, nil)
+}
+
+func testBackend(
+	t *testing.T,
+	initBackend func(t *testing.T) (NodeDBFactory, func()),
+	skipTests []string,
+) {
+	tests := []struct {
+		name string
+		fn   func(*testing.T, NodeDBFactory)
+	}{
+		{"FileCheckpointCreator", testFileCheckpointCreator},
+		{"OversizedChunks", testOversizedChunks},
+		{"PruneGapAfterCheckpointRestore", testPruneGapAfterCheckpointRestore},
+	}
+
+	skipMap := make(map[string]bool, len(skipTests))
+	for _, name := range skipTests {
+		skipMap[name] = true
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			if skipMap[tc.name] {
+				t.Skip("skipping test for this backend")
+			}
+
+			factory, cleanup := initBackend(t)
+			defer cleanup()
+			tc.fn(t, factory)
+		})
+	}
+}
+
+func testFileCheckpointCreator(t *testing.T, new NodeDBFactory) {
 	require := require.New(t)
 
 	// Generate some data.
@@ -32,7 +110,7 @@ func TestFileCheckpointCreator(t *testing.T) {
 	require.NoError(err, "TempDir")
 	defer os.RemoveAll(dir)
 
-	ndb, err := badgerDb.New(&db.Config{
+	ndb, err := new(&db.Config{
 		DB:           filepath.Join(dir, "db"),
 		Namespace:    testNs,
 		MaxCacheSize: 16 * 1024 * 1024,
@@ -56,11 +134,11 @@
 	}
 
 	// Create a file-based checkpoint creator.
-	fc, err := NewFileCreator(filepath.Join(dir, "checkpoints"), ndb)
+	fc, err := checkpoint.NewFileCreator(filepath.Join(dir, "checkpoints"), ndb)
 	require.NoError(err, "NewFileCreator")
 
 	// There should be no checkpoints before one is created.
-	cps, err := fc.GetCheckpoints(ctx, &GetCheckpointsRequest{})
+	cps, err := fc.GetCheckpoints(ctx, &checkpoint.GetCheckpointsRequest{})
 	require.NoError(err, "GetCheckpoints")
 	require.Len(cps, 0)
 
@@ -86,7 +164,7 @@
 	require.EqualValues(expectedChunks, cp.Chunks, "chunk hashes should be correct")
 
 	// There should now be one checkpoint.
-	cps, err = fc.GetCheckpoints(ctx, &GetCheckpointsRequest{Version: 1})
+	cps, err = fc.GetCheckpoints(ctx, &checkpoint.GetCheckpointsRequest{Version: 1})
 	require.NoError(err, "GetCheckpoints")
 	require.Len(cps, 1, "there should be one checkpoint")
 	require.Equal(cp, cps[0], "checkpoint returned by GetCheckpoint should be correct")
@@ -117,7 +195,7 @@
 	require.Error(err, "GetChunk on a non-existent chunk should fail")
 
 	// Create a fresh node database to restore into.
-	ndb2, err := badgerDb.New(&db.Config{
+	ndb2, err := new(&db.Config{
 		DB:           filepath.Join(dir, "db2"),
 		Namespace:    testNs,
 		MaxCacheSize: 16 * 1024 * 1024,
@@ -125,12 +203,12 @@
 	require.NoError(err, "New")
 
 	// Try to restore some chunks.
-	rs, err := NewRestorer(ndb2)
+	rs, err := checkpoint.NewRestorer(ndb2)
 	require.NoError(err, "NewRestorer")
 
 	_, err = rs.RestoreChunk(ctx, 0, &buf)
 	require.Error(err, "RestoreChunk should fail when no restore is in progress")
-	require.True(errors.Is(err, ErrNoRestoreInProgress))
+	require.True(errors.Is(err, checkpoint.ErrNoRestoreInProgress))
 
 	// Generate a bogus manifest which does not verify by corrupting chunk at index 1.
bogusCp, err := fc.GetCheckpoint(ctx, 1, root) @@ -153,7 +231,7 @@ func TestFileCheckpointCreator(t *testing.T) { err = rs.StartRestore(ctx, bogusCp) require.NoError(err, "StartRestore") for i := 0; i < len(bogusCp.Chunks); i++ { - var cm *ChunkMetadata + var cm *checkpoint.ChunkMetadata cm, err = cp.GetChunkMetadata(uint64(i)) require.NoError(err, "GetChunkMetadata") @@ -170,7 +248,7 @@ func TestFileCheckpointCreator(t *testing.T) { require.False(done, "RestoreChunk should not signal completed restoration") if i == 1 { require.Error(err, "RestoreChunk should fail with bogus chunk") - require.True(errors.Is(err, ErrChunkProofVerificationFailed)) + require.True(errors.Is(err, checkpoint.ErrChunkProofVerificationFailed)) // Restorer should be reset. break } @@ -183,12 +261,12 @@ func TestFileCheckpointCreator(t *testing.T) { require.NoError(err, "StartRestore") err = rs.StartRestore(ctx, cp) require.Error(err, "StartRestore should fail when a restore is already in progress") - require.True(errors.Is(err, ErrRestoreAlreadyInProgress)) + require.True(errors.Is(err, checkpoint.ErrRestoreAlreadyInProgress)) rcp := rs.GetCurrentCheckpoint() require.EqualValues(rcp, cp, "GetCurrentCheckpoint should return the checkpoint being restored") require.NotSame(rcp, cp, "GetCurrentCheckpoint should return a copy") for i := 0; i < len(cp.Chunks); i++ { - var cm *ChunkMetadata + var cm *checkpoint.ChunkMetadata cm, err = cp.GetChunkMetadata(uint64(i)) require.NoError(err, "GetChunkMetadata") @@ -213,7 +291,7 @@ func TestFileCheckpointCreator(t *testing.T) { _, err = rs.RestoreChunk(ctx, uint64(i), &buf) require.Error(err, "RestoreChunk should fail if the same chunk has already been restored") - require.True(errors.Is(err, ErrChunkAlreadyRestored)) + require.True(errors.Is(err, checkpoint.ErrChunkAlreadyRestored)) } } err = ndb2.Finalize([]node.Root{root}) @@ -233,7 +311,7 @@ func TestFileCheckpointCreator(t *testing.T) { require.NoError(err, "DeleteCheckpoint") // There should now be no checkpoints. - cps, err = fc.GetCheckpoints(ctx, &GetCheckpointsRequest{Version: 1}) + cps, err = fc.GetCheckpoints(ctx, &checkpoint.GetCheckpointsRequest{Version: 1}) require.NoError(err, "GetCheckpoints") require.Len(cps, 0, "there should be no checkpoints") @@ -259,7 +337,7 @@ func TestFileCheckpointCreator(t *testing.T) { require.Error(err, "CreateCheckpoint should fail for invalid root") } -func TestOversizedChunks(t *testing.T) { +func testOversizedChunks(t *testing.T, new NodeDBFactory) { require := require.New(t) // Generate some data. @@ -267,7 +345,7 @@ func TestOversizedChunks(t *testing.T) { require.NoError(err, "TempDir") defer os.RemoveAll(dir) - ndb, err := badgerDb.New(&db.Config{ + ndb, err := new(&db.Config{ DB: filepath.Join(dir, "db"), Namespace: testNs, MaxCacheSize: 16 * 1024 * 1024, @@ -294,7 +372,7 @@ func TestOversizedChunks(t *testing.T) { } // Create a file-based checkpoint creator. - fc, err := NewFileCreator(filepath.Join(dir, "checkpoints"), ndb) + fc, err := checkpoint.NewFileCreator(filepath.Join(dir, "checkpoints"), ndb) require.NoError(err, "NewFileCreator") // Create a checkpoint and check that it has been created correctly. @@ -305,7 +383,7 @@ func TestOversizedChunks(t *testing.T) { require.Len(cp.Chunks, 100, "there should be the correct number of chunks") } -func TestPruneGapAfterCheckpointRestore(t *testing.T) { +func testPruneGapAfterCheckpointRestore(t *testing.T, new NodeDBFactory) { require := require.New(t) // Generate some data. 
@@ -315,13 +393,13 @@ func TestPruneGapAfterCheckpointRestore(t *testing.T) { // Create two databases, the first will contain everything while the second one will only // contain the first few versions. - ndb1, err := badgerDb.New(&db.Config{ + ndb1, err := new(&db.Config{ DB: filepath.Join(dir, "db1"), Namespace: testNs, }) require.NoError(err, "New") - ndb2, err := badgerDb.New(&db.Config{ + ndb2, err := new(&db.Config{ DB: filepath.Join(dir, "db2"), Namespace: testNs, }) @@ -393,7 +471,7 @@ func TestPruneGapAfterCheckpointRestore(t *testing.T) { } // Create a file-based checkpoint creator for the first database. - fc, err := NewFileCreator(filepath.Join(dir, "checkpoints"), ndb1) + fc, err := checkpoint.NewFileCreator(filepath.Join(dir, "checkpoints"), ndb1) require.NoError(err, "NewFileCreator") // Create a checkpoint and check that it has been created correctly. @@ -401,7 +479,7 @@ func TestPruneGapAfterCheckpointRestore(t *testing.T) { require.NoError(err, "CreateCheckpoint") // Restore checkpoints in the second database. - rs, err := NewRestorer(ndb2) + rs, err := checkpoint.NewRestorer(ndb2) require.NoError(err, "NewRestorer") err = ndb2.StartMultipartInsert(cp.Root.Version) @@ -409,7 +487,7 @@ func TestPruneGapAfterCheckpointRestore(t *testing.T) { err = rs.StartRestore(ctx, cp) require.NoError(err, "StartRestore") for i := 0; i < len(cp.Chunks); i++ { - var cm *ChunkMetadata + var cm *checkpoint.ChunkMetadata cm, err = cp.GetChunkMetadata(uint64(i)) require.NoError(err, "GetChunkMetadata") diff --git a/go/storage/mkvs/checkpoint/checkpointer_test.go b/go/storage/mkvs/checkpoint/checkpointer_test.go index c3fdbd660bb..8f04c93d793 100644 --- a/go/storage/mkvs/checkpoint/checkpointer_test.go +++ b/go/storage/mkvs/checkpoint/checkpointer_test.go @@ -10,19 +10,23 @@ import ( "github.com/stretchr/testify/require" + "github.com/oasisprotocol/oasis-core/go/common" "github.com/oasisprotocol/oasis-core/go/common/crypto/hash" "github.com/oasisprotocol/oasis-core/go/storage/mkvs" db "github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/api" badgerDb "github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/badger" + "github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/rocksdb" "github.com/oasisprotocol/oasis-core/go/storage/mkvs/node" ) +var testNs = common.NewTestNamespaceFromSeed([]byte("oasis mkvs checkpoint test ns"), 0) + const ( testCheckInterval = 50 * time.Millisecond testNumKept = 2 ) -func testCheckpointer(t *testing.T, earliestVersion, interval uint64, preExistingData bool) { +func testCheckpointer(t *testing.T, new func(cfg *db.Config) (db.NodeDB, error), earliestVersion, interval uint64, preExistingData bool) { require := require.New(t) ctx := context.Background() @@ -31,7 +35,7 @@ func testCheckpointer(t *testing.T, earliestVersion, interval uint64, preExistin require.NoError(err, "TempDir") defer os.RemoveAll(dir) - ndb, err := badgerDb.New(&db.Config{ + ndb, err := new(&db.Config{ DB: filepath.Join(dir, "db"), Namespace: testNs, MaxCacheSize: 16 * 1024 * 1024, @@ -166,20 +170,38 @@ func testCheckpointer(t *testing.T, earliestVersion, interval uint64, preExistin } } -func TestCheckpointer(t *testing.T) { +func TestRocksDbCheckpointer(t *testing.T) { + t.Run("Basic", func(t *testing.T) { + testCheckpointer(t, rocksdb.New, 0, 1, false) + }) + t.Run("NonZeroEarliestVersion", func(t *testing.T) { + testCheckpointer(t, rocksdb.New, 1000, 1, false) + }) + t.Run("NonZeroEarliestInitialVersion", func(t *testing.T) { + testCheckpointer(t, rocksdb.New, 100, 1, true) + }) + 
t.Run("MaybeUnderflow", func(t *testing.T) { + testCheckpointer(t, rocksdb.New, 5, 10, true) + }) + t.Run("ForceCheckpoint", func(t *testing.T) { + testCheckpointer(t, rocksdb.New, 0, 10, false) + }) +} + +func TestBadgerDbCheckpointer(t *testing.T) { t.Run("Basic", func(t *testing.T) { - testCheckpointer(t, 0, 1, false) + testCheckpointer(t, badgerDb.New, 0, 1, false) }) t.Run("NonZeroEarliestVersion", func(t *testing.T) { - testCheckpointer(t, 1000, 1, false) + testCheckpointer(t, badgerDb.New, 1000, 1, false) }) t.Run("NonZeroEarliestInitialVersion", func(t *testing.T) { - testCheckpointer(t, 100, 1, true) + testCheckpointer(t, badgerDb.New, 100, 1, true) }) t.Run("MaybeUnderflow", func(t *testing.T) { - testCheckpointer(t, 5, 10, true) + testCheckpointer(t, badgerDb.New, 5, 10, true) }) t.Run("ForceCheckpoint", func(t *testing.T) { - testCheckpointer(t, 0, 10, false) + testCheckpointer(t, badgerDb.New, 0, 10, false) }) } diff --git a/go/storage/mkvs/db/api/api.go b/go/storage/mkvs/db/api/api.go index 47fb0a06183..5f9a03d6c81 100644 --- a/go/storage/mkvs/db/api/api.go +++ b/go/storage/mkvs/db/api/api.go @@ -131,7 +131,7 @@ type NodeDB interface { Prune(ctx context.Context, version uint64) error // Size returns the size of the database in bytes. - Size() (int64, error) + Size() (uint64, error) // Sync syncs the database to disk. This is useful if the NoFsync option is used to explicitly // perform a sync. @@ -250,7 +250,7 @@ func (d *nopNodeDB) Prune(context.Context, uint64) error { return nil } -func (d *nopNodeDB) Size() (int64, error) { +func (d *nopNodeDB) Size() (uint64, error) { return 0, nil } diff --git a/go/storage/mkvs/db/badger/badger.go b/go/storage/mkvs/db/badger/badger.go index 04df00fe988..5ab2309bfc1 100644 --- a/go/storage/mkvs/db/badger/badger.go +++ b/go/storage/mkvs/db/badger/badger.go @@ -37,7 +37,7 @@ var ( // old root). // // Value is CBOR-serialized write log. - writeLogKeyFmt = keyformat.New(0x01, uint64(0), &typedHash{}, &typedHash{}) + writeLogKeyFmt = keyformat.New(0x01, uint64(0), &node.TypedHash{}, &node.TypedHash{}) // rootsMetadataKeyFmt is the key format for roots metadata. The key format is (version). // // Value is CBOR-serialized rootsMetadata. @@ -47,7 +47,7 @@ var ( // the finalized roots. They key format is (version, root). // // Value is CBOR-serialized []updatedNode. - rootUpdatedNodesKeyFmt = keyformat.New(0x03, uint64(0), &typedHash{}) + rootUpdatedNodesKeyFmt = keyformat.New(0x03, uint64(0), &node.TypedHash{}) // metadataKeyFmt is the key format for metadata. // // Value is CBOR-serialized metadata. @@ -58,11 +58,11 @@ var ( // with these entries. // // Value is empty. - multipartRestoreNodeLogKeyFmt = keyformat.New(0x05, &typedHash{}) + multipartRestoreNodeLogKeyFmt = keyformat.New(0x05, &node.TypedHash{}) // rootNodeKeyFmt is the key format for root nodes (typed node hash). // // Value is empty. - rootNodeKeyFmt = keyformat.New(0x06, &typedHash{}) + rootNodeKeyFmt = keyformat.New(0x06, &node.TypedHash{}) ) // New creates a new BadgerDB-backed node database. 
@@ -181,7 +181,7 @@ func (d *badgerNodeDB) sanityCheckNamespace(ns common.Namespace) error { } func (d *badgerNodeDB) checkRoot(txn *badger.Txn, root node.Root) error { - rootHash := typedHashFromRoot(root) + rootHash := node.TypedHashFromRoot(root) if _, err := txn.Get(rootNodeKeyFmt.Encode(&rootHash)); err != nil { switch err { case badger.ErrKeyNotFound: @@ -229,7 +229,7 @@ func (d *badgerNodeDB) cleanMultipartLocked(removeNodes bool) error { d.logger.Info("removing some nodes from a multipart restore") logged = true } - var hash typedHash + var hash node.TypedHash if !multipartRestoreNodeLogKeyFmt.Decode(key, &hash) { panic("mkvs/badger: bad iterator") } @@ -358,14 +358,14 @@ func (d *badgerNodeDB) GetWriteLog(ctx context.Context, startRoot, endRoot node. type wlItem struct { depth uint8 - endRootHash typedHash + endRootHash node.TypedHash logKeys [][]byte - logRoots []typedHash + logRoots []node.TypedHash } // NOTE: We could use a proper deque, but as long as we keep the number of hops and // forks low, this should not be a problem. - queue := []*wlItem{{depth: 0, endRootHash: typedHashFromRoot(endRoot)}} - startRootHash := typedHashFromRoot(startRoot) + queue := []*wlItem{{depth: 0, endRootHash: node.TypedHashFromRoot(endRoot)}} + startRootHash := node.TypedHashFromRoot(startRoot) for len(queue) > 0 { if ctx.Err() != nil { return nil, ctx.Err() @@ -388,8 +388,8 @@ func (d *badgerNodeDB) GetWriteLog(ctx context.Context, startRoot, endRoot node. item := it.Item() var decVersion uint64 - var decEndRootHash typedHash - var decStartRootHash typedHash + var decEndRootHash node.TypedHash + var decStartRootHash node.TypedHash if !writeLogKeyFmt.Decode(item.Key(), &decVersion, &decEndRootHash, &decStartRootHash) { // This should not happen as the Badger iterator should take care of it. @@ -524,7 +524,7 @@ func (d *badgerNodeDB) HasRoot(root node.Root) bool { panic(err) } - _, exists := rootsMeta.Roots[typedHashFromRoot(root)] + _, exists := rootsMeta.Roots[node.TypedHashFromRoot(root)] return exists } @@ -564,12 +564,12 @@ func (d *badgerNodeDB) Finalize(roots []node.Root) error { // nolint: gocyclo // Determine the set of finalized roots. Finalization is transitive, so if // a parent root is finalized the child should be considered finalized too. - finalizedRoots := make(map[typedHash]bool) + finalizedRoots := make(map[node.TypedHash]bool) for _, root := range roots { if root.Version != version { return fmt.Errorf("mkvs/badger: roots to finalize don't have matching versions") } - finalizedRoots[typedHashFromRoot(root)] = true + finalizedRoots[node.TypedHashFromRoot(root)] = true } var rootsChanged bool @@ -650,12 +650,15 @@ func (d *badgerNodeDB) Finalize(roots []node.Root) error { // nolint: gocyclo // Remove write logs for the non-finalized root. 
 			if !d.discardWriteLogs {
 				if err = func() error {
 					rootWriteLogsPrefix := writeLogKeyFmt.Encode(version, &rootHash)
 					wit := tx.NewIterator(badger.IteratorOptions{Prefix: rootWriteLogsPrefix})
 					defer wit.Close()
 
 					for wit.Rewind(); wit.Valid(); wit.Next() {
 						if err = versionBatch.Delete(wit.Item().KeyCopy(nil)); err != nil {
 							return err
 						}
@@ -907,9 +910,9 @@ func (d *badgerNodeDB) NewBatch(oldRoot node.Root, version uint64, chunk bool) (
 	}, nil
 }
 
-func (d *badgerNodeDB) Size() (int64, error) {
+func (d *badgerNodeDB) Size() (uint64, error) {
 	lsm, vlog := d.db.Size()
-	return lsm + vlog, nil
+	return uint64(lsm) + uint64(vlog), nil
 }
 
 func (d *badgerNodeDB) Sync() error {
@@ -1013,7 +1016,7 @@ func (ba *badgerBatch) Commit(root node.Root) error {
 		return err
 	}
 
-	rootHash := typedHashFromRoot(root)
+	rootHash := node.TypedHashFromRoot(root)
 	if err = ba.bat.Set(rootNodeKeyFmt.Encode(&rootHash), []byte{}); err != nil {
 		return err
 	}
@@ -1034,7 +1037,7 @@ func (ba *badgerBatch) Commit(root node.Root) error {
 		}
 	} else {
 		// Create root with no derived roots.
-		rootsMeta.Roots[rootHash] = []typedHash{}
+		rootsMeta.Roots[rootHash] = []node.TypedHash{}
 
 		if err = rootsMeta.save(tx); err != nil {
 			return fmt.Errorf("mkvs/badger: failed to save roots metadata: %w", err)
@@ -1049,7 +1052,7 @@ func (ba *badgerBatch) Commit(root node.Root) error {
 		}
 	} else {
 		// Update the root link for the old root.
-		oldRootHash := typedHashFromRoot(ba.oldRoot)
+		oldRootHash := node.TypedHashFromRoot(ba.oldRoot)
 		if !ba.oldRoot.Hash.IsEmpty() {
 			if ba.oldRoot.Version < ba.db.meta.getEarliestVersion() && ba.oldRoot.Version != root.Version {
 				return api.ErrPreviousVersionMismatch
@@ -1136,7 +1139,7 @@ func (s *badgerSubtree) PutNode(_ node.Depth, ptr *node.Pointer) error {
 	nodeKey := nodeKeyFmt.Encode(&h)
 	if s.batch.multipartNodes != nil {
 		if _, err = s.batch.readTxn.Get(nodeKey); err != nil && errors.Is(err, badger.ErrKeyNotFound) {
-			th := typedHashFromParts(node.RootTypeInvalid, h)
+			th := node.TypedHashFromParts(node.RootTypeInvalid, h)
 			if err = s.batch.multipartNodes.Set(multipartRestoreNodeLogKeyFmt.Encode(&th), []byte{}); err != nil {
 				return err
 			}
diff --git a/go/storage/mkvs/db/badger/badger_test.go b/go/storage/mkvs/db/badger/badger_test.go
index 31b7ba825a7..0d8c93ecb19 100644
--- a/go/storage/mkvs/db/badger/badger_test.go
+++ b/go/storage/mkvs/db/badger/badger_test.go
@@ -293,89 +293,3 @@ func testExistingNodes(ctx *test) {
 	ctx.require.NoError(err, "AbortMultipartInsert()")
 	verifyNodes(ctx.require, ctx.badgerdb, ctx.ckNodes)
 }
-
-func TestVersionChecks(t *testing.T) {
-	require := require.New(t)
-	ndb, err := New(dbCfg)
-	require.NoError(err, "New()")
-	defer ndb.Close()
-	badgerdb := ndb.(*badgerNodeDB)
-
-	err = badgerdb.StartMultipartInsert(0)
-	require.Error(err, "StartMultipartInsert(0)")
-
-	err = badgerdb.StartMultipartInsert(42)
-	require.NoError(err, "StartMultipartInsert(42)")
-	err = badgerdb.StartMultipartInsert(44)
-	require.Error(err, "StartMultipartInsert(44)")
-
-	root := node.Root{}
-	_, err = badgerdb.NewBatch(root, 0, false) // Normal chunks not allowed during multipart.
- require.Error(err, "NewBatch(.., 0, false)") - _, err = badgerdb.NewBatch(root, 13, true) - require.Error(err, "NewBatch(.., 13, true)") - batch, err := badgerdb.NewBatch(root, 42, true) - require.NoError(err, "NewBatch(.., 42, true)") - defer batch.Reset() - - err = batch.Commit(root) - require.Error(err, "Commit(Root{0})") -} - -func TestReadOnlyBatch(t *testing.T) { - require := require.New(t) - - // No way to initialize a readonly-database, so it needs to be created rw first. - // This means we need persistence. - dir, err := os.MkdirTemp("", "oasis-storage-database-test") - require.NoError(err, "TempDir()") - defer os.RemoveAll(dir) - - readonlyCfg := *dbCfg - readonlyCfg.MemoryOnly = false - readonlyCfg.ReadOnly = false - readonlyCfg.DB = dir - - func() { - ndb, errRw := New(&readonlyCfg) - require.NoError(errRw, "New() - 1") - defer ndb.Close() - }() - - readonlyCfg.ReadOnly = true - ndb, err := New(&readonlyCfg) - require.NoError(err, "New() - 2") - defer ndb.Close() - badgerdb := ndb.(*badgerNodeDB) - - _, err = badgerdb.NewBatch(node.Root{}, 13, false) - require.Error(err, "NewBatch()") -} - -func TestFinalizeBasic(t *testing.T) { - ctx := context.Background() - require := require.New(t) - - offset := func(vals [][]byte) [][]byte { - ret := make([][]byte, 0, len(vals)) - for _, val := range vals { - ret = append(ret, append(val, 0x0a)) - } - return ret - } - - ndb, err := New(dbCfg) - require.NoError(err, "New()") - defer ndb.Close() - - root1 := fillDB(ctx, require, testValues, nil, 1, 2, ndb) - err = ndb.Finalize([]node.Root{root1}) - require.NoError(err, "Finalize({root1})") - - // Finalize a corrupted root. - currentValues := offset(testValues) - root2 := fillDB(ctx, require, currentValues, &root1, 2, 3, ndb) - root2.Hash[3]++ - err = ndb.Finalize([]node.Root{root2}) - require.Errorf(err, "mkvs: root not found", "Finalize({root2-broken})") -} diff --git a/go/storage/mkvs/db/badger/check.go b/go/storage/mkvs/db/badger/check.go index 9630abbfd11..8fa0f3817b7 100644 --- a/go/storage/mkvs/db/badger/check.go +++ b/go/storage/mkvs/db/badger/check.go @@ -123,7 +123,7 @@ func checkSanityInternal(ctx context.Context, db *badgerNodeDB, display DisplayH hashes: map[hash.Hash]*list.Element{}, } - lastRoots := make(map[typedHash]uint64) + lastRoots := make(map[node.TypedHash]uint64) for it.Seek(lastRootsMetadataKey); it.Valid(); it.Next() { rootsMeta := &rootsMetadata{} if !rootsMetadataKeyFmt.Decode(it.Item().Key(), &version) { @@ -187,7 +187,7 @@ func checkSanityInternal(ctx context.Context, db *badgerNodeDB, display DisplayH defer it.Close() for it.Rewind(); it.Valid(); it.Next() { - var srcRoot, dstRoot typedHash + var srcRoot, dstRoot node.TypedHash if !writeLogKeyFmt.Decode(it.Item().Key(), &version, &dstRoot, &srcRoot) { return fmt.Errorf("mkvs/badger/check: undecodable write log key (%v) at item version %d", it.Item().Key(), it.Item().Version()) } diff --git a/go/storage/mkvs/db/badger/metadata.go b/go/storage/mkvs/db/badger/metadata.go index e265891ebc1..7be82a9c14f 100644 --- a/go/storage/mkvs/db/badger/metadata.go +++ b/go/storage/mkvs/db/badger/metadata.go @@ -9,6 +9,7 @@ import ( "github.com/oasisprotocol/oasis-core/go/common" "github.com/oasisprotocol/oasis-core/go/common/cbor" "github.com/oasisprotocol/oasis-core/go/common/crypto/hash" + "github.com/oasisprotocol/oasis-core/go/storage/mkvs/node" ) // serializedMetadata is the on-disk serialized metadata. 
@@ -115,7 +116,7 @@ type rootsMetadata struct { _ struct{} `cbor:",toarray"` // Roots is the map of a root created in a version to any derived roots (in this or later versions). - Roots map[typedHash][]typedHash + Roots map[node.TypedHash][]node.TypedHash // version is the version this metadata is for. version uint64 @@ -131,7 +132,7 @@ func loadRootsMetadata(tx *badger.Txn, version uint64) (*rootsMetadata, error) { return nil, fmt.Errorf("mkvs/badger: error reading roots metadata: %w", err) } case badger.ErrKeyNotFound: - rootsMeta.Roots = make(map[typedHash][]typedHash) + rootsMeta.Roots = make(map[node.TypedHash][]node.TypedHash) default: return nil, fmt.Errorf("mkvs/badger: error reading roots metadata: %w", err) } diff --git a/go/storage/mkvs/db/badger/migrate.go b/go/storage/mkvs/db/badger/migrate.go index a0d6efd05c2..295d5fb31f6 100644 --- a/go/storage/mkvs/db/badger/migrate.go +++ b/go/storage/mkvs/db/badger/migrate.go @@ -339,7 +339,7 @@ func (v4 *v4Migrator) keyRootsMetadata(item *badger.Item) error { // nolint: goc // Create root typing keys. for h, types := range plainRoots { for t := range types { - th := typedHashFromParts(t, h) + th := node.TypedHashFromParts(t, h) entry := badger.NewEntry( v4RootNodeKeyFmt.Encode(&th), []byte{}, @@ -352,15 +352,15 @@ func (v4 *v4Migrator) keyRootsMetadata(item *badger.Item) error { // nolint: goc // Build new roots structure. var newRoots v4RootsMetadata - newRoots.Roots = map[typedHash][]typedHash{} + newRoots.Roots = map[node.TypedHash][]node.TypedHash{} for root, chain := range rootsMeta.Roots { for typ := range plainRoots[root] { - arr := make([]typedHash, 0, len(chain)) + arr := make([]node.TypedHash, 0, len(chain)) for _, droot := range chain { - th := typedHashFromParts(typ, droot) + th := node.TypedHashFromParts(typ, droot) arr = append(arr, th) } - th := typedHashFromParts(typ, root) + th := node.TypedHashFromParts(typ, root) newRoots.Roots[th] = arr } } @@ -380,7 +380,7 @@ func (v4 *v4Migrator) keyRootsMetadata(item *badger.Item) error { // nolint: goc func (v4 *v4Migrator) keyWriteLog(item *badger.Item) error { var version uint64 var h1, h2 hash.Hash - var th1, th2 typedHash + var th1, th2 node.TypedHash if !v3WriteLogKeyFmt.Decode(item.Key(), &version, &h1, &h2) { return fmt.Errorf("error decoding writelog key") } @@ -449,7 +449,7 @@ func (v4 *v4Migrator) keyRootUpdatedNodes(item *badger.Item) error { } if item.IsDeletedOrExpired() { for _, typ := range types { - th := typedHashFromParts(typ, h1) + th := node.TypedHashFromParts(typ, h1) key := v4RootUpdatedNodesKeyFmt.Encode(version, &th) if err = v4.changeBatch.DeleteAt(key, item.Version()); err != nil { return fmt.Errorf("error transforming removed updated nodes list for root %v: %w", th, err) @@ -475,7 +475,7 @@ func (v4 *v4Migrator) keyRootUpdatedNodes(item *badger.Item) error { } for _, typ := range types { - th := typedHashFromParts(typ, h1) + th := node.TypedHashFromParts(typ, h1) if v4.meta.MultipartActive { entry := badger.NewEntry( @@ -521,7 +521,7 @@ func (v4 *v4Migrator) keyMultipartRestoreNodeLog(item *badger.Item) error { if err := v4.changeBatch.DeleteAt(item.KeyCopy(nil), item.Version()); err != nil { return fmt.Errorf("can't delete old multipart restore log key for %v: %w", h, err) } - th := typedHashFromParts(node.RootTypeInvalid, h) + th := node.TypedHashFromParts(node.RootTypeInvalid, h) entry := badger.NewEntry( v4MultipartRestoreNodeLogKeyFmt.Encode(&th), []byte{}, @@ -701,16 +701,16 @@ func (v4 *v4Migrator) Migrate() (rversion uint64, rerr error) { } type 
v5MigratedRoot struct { - Hash typedHash `json:"hash"` - Version uint64 `json:"version"` + Hash node.TypedHash `json:"hash"` + Version uint64 `json:"version"` } type v5MigratorMetadata struct { migrationCommonMeta - LastMigratedVersion *uint64 `json:"last_migrated_version"` - LastMigratedRoots map[typedHash]v5MigratedRoot `json:"last_migrated_roots"` - LastPrunedVersion *uint64 `json:"last_pruned_version"` + LastMigratedVersion *uint64 `json:"last_migrated_version"` + LastMigratedRoots map[node.TypedHash]v5MigratedRoot `json:"last_migrated_roots"` + LastPrunedVersion *uint64 `json:"last_pruned_version"` } func (m *v5MigratorMetadata) load(db *badger.DB) error { @@ -952,7 +952,7 @@ func (v5 *v5Migrator) migrateNode(h hash.Hash, version uint64) (*hash.Hash, erro return &newHash, nil } -func (v5 *v5Migrator) migrateWriteLog(oldSrcRoot, oldDstRoot, newSrcRoot typedHash, newDstRoot v5MigratedRoot) error { +func (v5 *v5Migrator) migrateWriteLog(oldSrcRoot, oldDstRoot, newSrcRoot node.TypedHash, newDstRoot v5MigratedRoot) error { item, err := v5.readTxn.Get(v4WriteLogKeyFmt.Encode(newDstRoot.Version, &oldDstRoot, &oldSrcRoot)) switch err { case nil: @@ -983,7 +983,7 @@ func (v5 *v5Migrator) migrateWriteLog(oldSrcRoot, oldDstRoot, newSrcRoot typedHa return nil } -func (v5 *v5Migrator) migrateVersion(version uint64, migratedRoots map[typedHash]v5MigratedRoot) (bool, error) { +func (v5 *v5Migrator) migrateVersion(version uint64, migratedRoots map[node.TypedHash]v5MigratedRoot) (bool, error) { defer func() { v5.readTxn.Discard() v5.readTxn = v5.db.db.NewTransactionAt(maxTimestamp, false) @@ -1008,7 +1008,7 @@ func (v5 *v5Migrator) migrateVersion(version uint64, migratedRoots map[typedHash return false, fmt.Errorf("error decoding roots metadata for version %d: %w", version, err) } - newRoots := make(map[typedHash][]typedHash) + newRoots := make(map[node.TypedHash][]node.TypedHash) for root := range roots.Roots { // Migrate the tree (if not empty). var newRootHash hash.Hash @@ -1023,14 +1023,14 @@ func (v5 *v5Migrator) migrateVersion(version uint64, migratedRoots map[typedHash newRootHash.Empty() } - newRoot := typedHashFromParts(root.Type(), newRootHash) - newRoots[newRoot] = []typedHash{} + newRoot := node.TypedHashFromParts(root.Type(), newRootHash) + newRoots[newRoot] = []node.TypedHash{} migratedRoots[root] = v5MigratedRoot{Hash: newRoot, Version: version} // Check for a write log from empty root. var emptyHash hash.Hash emptyHash.Empty() - emptyRoot := typedHashFromParts(root.Type(), emptyHash) + emptyRoot := node.TypedHashFromParts(root.Type(), emptyHash) if err = v5.migrateWriteLog(emptyRoot, root, emptyRoot, migratedRoots[root]); err != nil { return false, err @@ -1227,7 +1227,7 @@ func (v5 *v5Migrator) pruneVersion(version uint64) error { return v5.flush(false) } -func (v5 *v5Migrator) pruneWriteLog(version uint64, oldRoot typedHash) error { +func (v5 *v5Migrator) pruneWriteLog(version uint64, oldRoot node.TypedHash) error { prefix := v4WriteLogKeyFmt.Encode(version, &oldRoot) it := v5.readTxn.NewIterator(badger.IteratorOptions{Prefix: prefix}) defer it.Close() @@ -1307,7 +1307,7 @@ func (v5 *v5Migrator) Migrate() (rversion uint64, rerr error) { v4RootsMetadataKeyFmt.Decode(it.Item().Key(), &lastVersion) it.Close() - migratedRoots := make(map[typedHash]v5MigratedRoot) + migratedRoots := make(map[node.TypedHash]v5MigratedRoot) if lv := v5.meta.LastMigratedVersion; lv != nil { // Resume at the following version. 
lastVersion = *lv - 1 diff --git a/go/storage/mkvs/db/db_test.go b/go/storage/mkvs/db/db_test.go index 16d8f09a5f9..d6d7c078341 100644 --- a/go/storage/mkvs/db/db_test.go +++ b/go/storage/mkvs/db/db_test.go @@ -1,14 +1,20 @@ -package db +package db_test import ( "context" "fmt" + "os" + "strconv" "testing" "github.com/stretchr/testify/require" + "github.com/oasisprotocol/oasis-core/go/common" "github.com/oasisprotocol/oasis-core/go/common/crypto/hash" + "github.com/oasisprotocol/oasis-core/go/storage/mkvs" "github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/api" + badgerDb "github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/badger" + "github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/rocksdb" "github.com/oasisprotocol/oasis-core/go/storage/mkvs/node" "github.com/oasisprotocol/oasis-core/go/storage/mkvs/writelog" ) @@ -17,6 +23,24 @@ const ( writeLogSize = 100 ) +var ( + testNs = common.NewTestNamespaceFromSeed([]byte("oasis db test ns"), 0) + dbCfg = &api.Config{ + Namespace: testNs, + MaxCacheSize: 16 * 1024 * 1024, + NoFsync: true, + MemoryOnly: true, + } + testValues = [][]byte{ + []byte("colorless green ideas sleep furiously"), + []byte("excepting understandable chairs piously"), + []byte("at the prickle for rainbow hoovering"), + } +) + +// NodeDBFactory is a function that creates a new node database for the given config. +type NodeDBFactory func(cfg *api.Config) (api.NodeDB, error) + func makeWriteLog() writelog.WriteLog { wl := make(writelog.WriteLog, writeLogSize) @@ -85,3 +109,203 @@ func TestHashedWriteLog(t *testing.T) { } require.Equal(t, i, len(wl)) } + +func TestBadgerBackend(t *testing.T) { + testBackend(t, func(t *testing.T) (NodeDBFactory, func()) { + // Create a new random temporary directory under /tmp. + dir, err := os.MkdirTemp("", "mkvs.test.badger") + require.NoError(t, err, "TempDir") + + // Create a Badger-backed Node DB factory. + factory := func(cfg *api.Config) (api.NodeDB, error) { + if cfg.DB == "" { + cfg.DB = dir + } + return badgerDb.New(cfg) + } + + cleanup := func() { + os.RemoveAll(dir) + } + + return factory, cleanup + }, nil) +} + +func TestRocksDBBackend(t *testing.T) { + testBackend(t, func(t *testing.T) (NodeDBFactory, func()) { + // Create a new random temporary directory under /tmp. + dir, err := os.MkdirTemp("", "mkvs.test.rocksdb") + require.NoError(t, err, "TempDir") + + // Create a RocksDB-backed Node DB factory. 
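+		// Default the database path to the shared temporary directory when the
+		// config does not set one.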
+ factory := func(cfg *api.Config) (api.NodeDB, error) { + if cfg.DB == "" { + cfg.DB = dir + } + return rocksdb.New(cfg) + } + + cleanup := func() { + os.RemoveAll(dir) + } + + return factory, cleanup + }, nil) +} + +func testBackend( + t *testing.T, + initBackend func(t *testing.T) (NodeDBFactory, func()), + skipTests []string, +) { + tests := []struct { + name string + fn func(*testing.T, NodeDBFactory) + }{ + {"FinalizeBasic", testFinalizeBasic}, + {"VersionChecks", testVersionChecks}, + {"ReadOnlyBatch", testReadOnlyBatch}, + } + + skipMap := make(map[string]bool, len(skipTests)) + for _, name := range skipTests { + skipMap[name] = true + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + if skipMap[tc.name] { + t.Skip("skipping test for this backend") + } + + factory, cleanup := initBackend(t) + defer cleanup() + tc.fn(t, factory) + }) + } +} + +func fillDB( + ctx context.Context, + require *require.Assertions, + values [][]byte, + prevRoot *node.Root, + version, commitVersion uint64, + ndb api.NodeDB, +) node.Root { + if prevRoot == nil { + emptyRoot := node.Root{ + Namespace: testNs, + Version: version, + Type: node.RootTypeState, + } + emptyRoot.Hash.Empty() + prevRoot = &emptyRoot + } + + tree := mkvs.NewWithRoot(nil, ndb, *prevRoot) + require.NotNil(tree, "NewWithRoot()") + + var wl writelog.WriteLog + for i, val := range values { + wl = append(wl, writelog.LogEntry{Key: []byte(strconv.Itoa(i)), Value: val}) + } + + err := tree.ApplyWriteLog(ctx, writelog.NewStaticIterator(wl)) + require.NoError(err, "ApplyWriteLog()") + + _, hash, err := tree.Commit(ctx, testNs, commitVersion) + require.NoError(err, "Commit()") + + return node.Root{ + Namespace: testNs, + Version: version + 1, + Type: node.RootTypeState, + Hash: hash, + } +} + +func testFinalizeBasic(t *testing.T, new NodeDBFactory) { + ctx := context.Background() + require := require.New(t) + + offset := func(vals [][]byte) [][]byte { + ret := make([][]byte, 0, len(vals)) + for _, val := range vals { + ret = append(ret, append(val, 0x0a)) + } + return ret + } + + ndb, err := new(dbCfg) + require.NoError(err, "New()") + defer ndb.Close() + + root1 := fillDB(ctx, require, testValues, nil, 1, 2, ndb) + err = ndb.Finalize([]node.Root{root1}) + require.NoError(err, "Finalize({root1})") + + // Finalize a corrupted root. + currentValues := offset(testValues) + root2 := fillDB(ctx, require, currentValues, &root1, 2, 3, ndb) + root2.Hash[3]++ + err = ndb.Finalize([]node.Root{root2}) + require.Errorf(err, "mkvs: root not found", "Finalize({root2-broken})") +} + +func testVersionChecks(t *testing.T, new NodeDBFactory) { + require := require.New(t) + ndb, err := new(dbCfg) + require.NoError(err, "New()") + defer ndb.Close() + + err = ndb.StartMultipartInsert(0) + require.Error(err, "StartMultipartInsert(0)") + + err = ndb.StartMultipartInsert(42) + require.NoError(err, "StartMultipartInsert(42)") + err = ndb.StartMultipartInsert(44) + require.Error(err, "StartMultipartInsert(44)") + + root := node.Root{} + _, err = ndb.NewBatch(root, 0, false) // Normal chunks not allowed during multipart. 
+	require.Error(err, "NewBatch(.., 0, false)")
+	_, err = ndb.NewBatch(root, 13, true)
+	require.Error(err, "NewBatch(.., 13, true)")
+	batch, err := ndb.NewBatch(root, 42, true)
+	require.NoError(err, "NewBatch(.., 42, true)")
+	defer batch.Reset()
+
+	err = batch.Commit(root)
+	require.Error(err, "Commit(Root{0})")
+}
+
+func testReadOnlyBatch(t *testing.T, new NodeDBFactory) {
+	require := require.New(t)
+
+	// No way to initialize a readonly-database, so it needs to be created rw first.
+	// This means we need persistence.
+	dir, err := os.MkdirTemp("", "oasis-storage-database-test")
+	require.NoError(err, "TempDir()")
+	defer os.RemoveAll(dir)
+
+	readonlyCfg := *dbCfg
+	readonlyCfg.MemoryOnly = false
+	readonlyCfg.ReadOnly = false
+	readonlyCfg.DB = dir
+
+	func() {
+		ndb, errRw := new(&readonlyCfg)
+		require.NoError(errRw, "New() - 1")
+		defer ndb.Close()
+	}()
+
+	readonlyCfg.ReadOnly = true
+	ndb, err := new(&readonlyCfg)
+	require.NoError(err, "New() - 2")
+	defer ndb.Close()
+
+	_, err = ndb.NewBatch(node.Root{}, 13, false)
+	require.Error(err, "NewBatch()")
+}
diff --git a/go/storage/mkvs/db/rocksdb/batch.go b/go/storage/mkvs/db/rocksdb/batch.go
new file mode 100644
index 00000000000..b5e5c640169
--- /dev/null
+++ b/go/storage/mkvs/db/rocksdb/batch.go
@@ -0,0 +1,246 @@
+package rocksdb
+
+import (
+	"bytes"
+	"fmt"
+
+	"github.com/linxGnu/grocksdb"
+
+	"github.com/oasisprotocol/oasis-core/go/common/cbor"
+	"github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/api"
+	"github.com/oasisprotocol/oasis-core/go/storage/mkvs/node"
+	"github.com/oasisprotocol/oasis-core/go/storage/mkvs/writelog"
+)
+
+var _ api.Batch = (*rocksdbBatch)(nil)
+
+type rocksdbBatch struct {
+	api.BaseBatch
+
+	db             *rocksdbNodeDB
+	bat            *grocksdb.WriteBatch
+	multipartNodes *grocksdb.WriteBatch
+
+	oldRoot node.Root
+	chunk   bool
+
+	version uint64
+
+	writeLog     writelog.WriteLog
+	annotations  writelog.Annotations
+	updatedNodes []updatedNode
+}
+
+// Commit implements api.Batch.
+func (ba *rocksdbBatch) Commit(root node.Root) error {
+	ba.db.metaUpdateLock.Lock()
+	defer ba.db.metaUpdateLock.Unlock()
+
+	if ba.db.multipartVersion != multipartVersionNone && ba.db.multipartVersion != root.Version {
+		return api.ErrInvalidMultipartVersion
+	}
+
+	if err := ba.db.sanityCheckNamespace(root.Namespace); err != nil {
+		return err
+	}
+	if !root.Follows(&ba.oldRoot) {
+		return api.ErrRootMustFollowOld
+	}
+
+	// Make sure that the version that we try to commit into has not yet been finalized.
+	lastFinalizedVersion, exists := ba.db.meta.getLastFinalizedVersion()
+	if exists && lastFinalizedVersion >= root.Version {
+		return api.ErrAlreadyFinalized
+	}
+
+	rootsMeta, err := loadRootsMetadata(ba.db.db, root.Version)
+	if err != nil {
+		return err
+	}
+
+	// cf := ba.db.getColumnFamilyForRoot(root)
+
+	rootHash := node.TypedHashFromRoot(root)
+	ts := timestampFromVersion(root.Version)
+	ba.bat.PutCFWithTS(ba.db.cfNode, rootNodeKeyFmt.Encode(&rootHash), ts[:], []byte{})
+	if ba.multipartNodes != nil {
+		ba.multipartNodes.Put(multipartRestoreNodeLogKeyFmt.Encode(&rootHash), []byte{})
+	}
+
+	if rootsMeta.Roots[rootHash] != nil {
+		// Root already exists, no need to do anything since if the hash matches, everything will
+		// be identical and we would just be duplicating work.
+		//
+		// If we are importing a chunk, there can be multiple commits for the same root.
+		if !ba.chunk {
+			ba.Reset()
+			return ba.BaseBatch.Commit(root)
+		}
+	} else {
+		// Create root with no derived roots.
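+		// Derived roots are appended below when a later commit builds on top
+		// of this root.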
+		rootsMeta.Roots[rootHash] = []node.TypedHash{}
+		rootsMeta.save(ba.bat)
+	}
+
+	if ba.chunk {
+		// Skip most of metadata updates if we are just importing chunks.
+		key := rootUpdatedNodesKeyFmt.Encode(root.Version, &rootHash)
+		ba.bat.Put(key, cbor.Marshal([]updatedNode{}))
+	} else {
+		// Update the root link for the old root.
+		oldRootHash := node.TypedHashFromRoot(ba.oldRoot)
+		if !ba.oldRoot.Hash.IsEmpty() {
+			if ba.oldRoot.Version < ba.db.meta.getEarliestVersion() && ba.oldRoot.Version != root.Version {
+				return api.ErrPreviousVersionMismatch
+			}
+
+			// TODO: LongKeys.
+			// Old code re-loaded loadRootsMetadata here (which was saved in line 84). However, I
+			// think this is not needed. Moreover, we lose the pending updates here, since the batch
+			// has not been submitted yet; this differs from the Badger transaction semantics. Maybe
+			// we should use transactions here.
+			var oldRootsMeta *rootsMetadata
+			oldRootsMeta, err = loadRootsMetadata(ba.db.db, ba.oldRoot.Version)
+			if err != nil {
+				return err
+			}
+			// Check if overridden in the current WriteBatch.
+			// TODO: This is probably not needed; just use rootsMeta here?
+			wbIter := ba.bat.NewIterator()
+			for {
+				if !wbIter.Next() {
+					break
+				}
+				rec := wbIter.Record()
+				if bytes.Equal(rec.Key, rootsMetadataKeyFmt.Encode(ba.oldRoot.Version)) {
+					if rec.Type == grocksdb.WriteBatchValueRecord {
+						if err = cbor.Unmarshal(rec.Value, &oldRootsMeta); err != nil {
+							panic(err)
+						}
+					}
+				}
+			}
+
+			if _, ok := oldRootsMeta.Roots[oldRootHash]; !ok {
+				return api.ErrRootNotFound
+			}
+
+			oldRootsMeta.Roots[oldRootHash] = append(oldRootsMeta.Roots[oldRootHash], rootHash)
+			oldRootsMeta.save(ba.bat)
+		}
+
+		// Store updated nodes (only needed until the version is finalized).
+		key := rootUpdatedNodesKeyFmt.Encode(root.Version, &rootHash)
+		ba.bat.Put(key, cbor.Marshal(ba.updatedNodes))
+
+		// Store write log.
+		if ba.writeLog != nil && ba.annotations != nil {
+			log := api.MakeHashedDBWriteLog(ba.writeLog, ba.annotations)
+			data := cbor.Marshal(log)
+			key := writeLogKeyFmt.Encode(root.Version, &rootHash, &oldRootHash)
+			ba.bat.PutCFWithTS(ba.db.cfNode, key, ts[:], data)
+		}
+	}
+
+	// Flush node updates.
+	if ba.multipartNodes != nil {
+		if err = ba.db.db.Write(defaultWriteOptions, ba.multipartNodes); err != nil {
+			return fmt.Errorf("mkvs/rocksdb: failed to flush node log batch: %w", err)
+		}
+	}
+	if err = ba.db.db.Write(defaultWriteOptions, ba.bat); err != nil {
+		return fmt.Errorf("mkvs/rocksdb: failed to flush batch: %w", err)
+	}
+
+	ba.writeLog = nil
+	ba.annotations = nil
+	ba.updatedNodes = nil
+
+	return ba.BaseBatch.Commit(root)
+}
+
+// MaybeStartSubtree implements api.Batch.
+func (ba *rocksdbBatch) MaybeStartSubtree(subtree api.Subtree, depth node.Depth, subtreeRoot *node.Pointer) api.Subtree {
+	if subtree == nil {
+		return &rocksdbSubtree{batch: ba}
+	}
+	return subtree
+}
+
+// PutWriteLog implements api.Batch.
+func (ba *rocksdbBatch) PutWriteLog(writeLog writelog.WriteLog, annotations writelog.Annotations) error {
+	if ba.chunk {
+		return fmt.Errorf("mkvs/rocksdb: cannot put write log in chunk mode")
+	}
+	if ba.db.discardWriteLogs {
+		return nil
+	}
+
+	ba.writeLog = writeLog
+	ba.annotations = annotations
+	return nil
+}
+
+// RemoveNodes implements api.Batch.
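+//
+// The nodes are only recorded in the batch's updated-node list here; the
+// actual cleanup happens when the version is finalized.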
+func (ba *rocksdbBatch) RemoveNodes(nodes []node.Node) error { + if ba.chunk { + return fmt.Errorf("mkvs/rocksdb: cannot remove nodes in chunk mode") + } + + for _, n := range nodes { + ba.updatedNodes = append(ba.updatedNodes, updatedNode{ + Removed: true, + Hash: n.GetHash(), + }) + } + return nil +} + +// Reset implements api.Batch. +func (ba *rocksdbBatch) Reset() { + ba.bat.Destroy() + if ba.multipartNodes != nil { + ba.multipartNodes.Destroy() + } + ba.writeLog = nil + ba.annotations = nil + ba.updatedNodes = nil +} + +type rocksdbSubtree struct { + batch *rocksdbBatch +} + +func (s *rocksdbSubtree) PutNode(_ node.Depth, ptr *node.Pointer) error { + data, err := ptr.Node.MarshalBinary() + if err != nil { + return err + } + + h := ptr.Node.GetHash() + s.batch.updatedNodes = append(s.batch.updatedNodes, updatedNode{Hash: h}) + nodeKey := nodeKeyFmt.Encode(&h) + if s.batch.multipartNodes != nil { + item, err := s.batch.db.db.GetCF(timestampReadOptions(s.batch.version), s.batch.db.cfNode, nodeKey) + if err != nil { + return err + } + defer item.Free() + if !item.Exists() { + th := node.TypedHashFromParts(node.RootTypeInvalid, h) + s.batch.multipartNodes.Put(multipartRestoreNodeLogKeyFmt.Encode(&th), []byte{}) + } + } + + ts := timestampFromVersion(s.batch.version) + s.batch.bat.PutCFWithTS(s.batch.db.cfNode, nodeKey, ts[:], data) + return nil +} + +func (s *rocksdbSubtree) VisitCleanNode(node.Depth, *node.Pointer) error { + return nil +} + +func (s *rocksdbSubtree) Commit() error { + return nil +} diff --git a/go/storage/mkvs/db/rocksdb/iterator.go b/go/storage/mkvs/db/rocksdb/iterator.go new file mode 100644 index 00000000000..8083aef602b --- /dev/null +++ b/go/storage/mkvs/db/rocksdb/iterator.go @@ -0,0 +1,153 @@ +package rocksdb + +import ( + "bytes" + "slices" + + "github.com/linxGnu/grocksdb" +) + +type iterator struct { + source *grocksdb.Iterator + start, end []byte + reverse bool + invalid bool +} + +// TODO: add support for prefix, on valid, check if prefix matches. +func newIterator(source *grocksdb.Iterator, start, end []byte, reverse bool) *iterator { + switch reverse { + case false: + if start == nil { + source.SeekToFirst() + } else { + source.Seek(start) + } + case true: + if end == nil { + source.SeekToLast() + } else { + source.Seek(end) + + if source.Valid() { + // We are either at the matching key, or the next key. + eoaKey := readOnlySlice(source.Key()) + if bytes.Compare(end, eoaKey) <= 0 { // end == aoaKey, or end < eaoKey + // End is exclusive, so move to the previous key. + source.Prev() + } + } else { + // Past the end of the db, move to the last key. + source.SeekToLast() + } + } + + } + + return &iterator{ + source: source, + start: start, + end: end, + reverse: reverse, + invalid: !source.Valid(), + } +} + +func readOnlySlice(s *grocksdb.Slice) []byte { + if !s.Exists() { + return nil + } + + return s.Data() +} + +// copyAndFreeSlice will copy a given RocksDB slice and free it. If the slice does +// not exist, will be returned. 
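
// [Editor's note, not part of this patch.] The two slice helpers have
// deliberately different ownership: readOnlySlice above borrows the
// C-allocated buffer, which stays valid only until the slice is freed or the
// iterator advances, while copyAndFreeSlice below takes ownership, copies the
// bytes onto the Go heap, and frees the RocksDB slice eagerly. Borrow for
// transient comparisons; copy for anything retained past the call.
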
+func copyAndFreeSlice(s *grocksdb.Slice) []byte { + defer s.Free() + if !s.Exists() { + return nil + } + + return slices.Clone(s.Data()) +} + +func (itr *iterator) Valid() bool { + // once invalid, forever invalid + if itr.invalid { + return false + } + + // if source has error, consider it invalid + if err := itr.source.Err(); err != nil { + itr.invalid = true + return false + } + + // if source is invalid, consider it invalid + if !itr.source.Valid() { + itr.invalid = true + return false + } + + // if key is at the end or past it, consider it invalid + start := itr.start + end := itr.end + key := readOnlySlice(itr.source.Key()) + + if itr.reverse { + if start != nil && bytes.Compare(key, start) < 0 { + itr.invalid = true + return false + } + } else { + if end != nil && bytes.Compare(end, key) <= 0 { + itr.invalid = true + return false + } + } + + return true +} + +func (itr *iterator) Key() []byte { + itr.assertIsValid() + return copyAndFreeSlice(itr.source.Key()) +} + +func (itr *iterator) Value() []byte { + itr.assertIsValid() + return copyAndFreeSlice(itr.source.Value()) +} + +func (itr iterator) Next() bool { + if itr.invalid { + return false + } + + if itr.reverse { + itr.source.Prev() + } else { + itr.source.Next() + } + + return itr.Valid() +} + +func (itr *iterator) Error() error { + return itr.source.Err() +} + +func (itr *iterator) Close() { + if itr.source != nil { + itr.source.Close() + } + itr.source = nil + itr.invalid = true +} + +func (itr *iterator) assertIsValid() { + if itr.invalid { + panic("iterator is invalid") + } +} diff --git a/go/storage/mkvs/db/rocksdb/metadata.go b/go/storage/mkvs/db/rocksdb/metadata.go new file mode 100644 index 00000000000..9935d16c03b --- /dev/null +++ b/go/storage/mkvs/db/rocksdb/metadata.go @@ -0,0 +1,153 @@ +package rocksdb + +import ( + "fmt" + "sync" + + "github.com/linxGnu/grocksdb" + + "github.com/oasisprotocol/oasis-core/go/common" + "github.com/oasisprotocol/oasis-core/go/common/cbor" + "github.com/oasisprotocol/oasis-core/go/common/crypto/hash" + "github.com/oasisprotocol/oasis-core/go/storage/mkvs/node" +) + +// serializedMetadata is the on-disk serialized metadata. +type serializedMetadata struct { + // Version is the database schema version. + Version uint64 `json:"version"` + // Namespace is the namespace this database is for. + Namespace common.Namespace `json:"namespace"` + + // EarliestVersion is the earliest version. + EarliestVersion uint64 `json:"earliest_version"` + // LastFinalizedVersion is the last finalized version. + LastFinalizedVersion *uint64 `json:"last_finalized_version"` + // MultipartVersion is the version for the in-progress multipart restore, or 0 if none was in progress. + MultipartVersion uint64 `json:"multipart_version"` +} + +// metadata is the database metadata. +type metadata struct { + sync.RWMutex + + value serializedMetadata +} + +func (m *metadata) getEarliestVersion() uint64 { + m.RLock() + defer m.RUnlock() + + return m.value.EarliestVersion +} + +func (m *metadata) setEarliestVersion(batch *grocksdb.WriteBatch, version uint64) { + m.Lock() + defer m.Unlock() + + // The earliest version can only increase, not decrease. 
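
// [Editor's note: illustrative sketch, not part of this patch.]
// The guard below makes setEarliestVersion idempotent and monotonic, so
// replaying an older batch can never move the earliest version backwards:
//
//	var m metadata
//	b := grocksdb.NewWriteBatch()
//	m.setEarliestVersion(b, 5)
//	m.setEarliestVersion(b, 3) // no-op: 3 < 5
//	m.getEarliestVersion()     // still 5
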
+	if version < m.value.EarliestVersion {
+		return
+	}
+
+	m.value.EarliestVersion = version
+	m.saveB(batch)
+}
+
+func (m *metadata) getLastFinalizedVersion() (uint64, bool) {
+	m.RLock()
+	defer m.RUnlock()
+
+	if m.value.LastFinalizedVersion == nil {
+		return 0, false
+	}
+	return *m.value.LastFinalizedVersion, true
+}
+
+func (m *metadata) setLastFinalizedVersion(batch *grocksdb.WriteBatch, version uint64) {
+	m.Lock()
+	defer m.Unlock()
+
+	if m.value.LastFinalizedVersion != nil && version <= *m.value.LastFinalizedVersion {
+		return
+	}
+
+	if m.value.LastFinalizedVersion == nil {
+		m.value.EarliestVersion = version
+	}
+
+	m.value.LastFinalizedVersion = &version
+	m.saveB(batch)
+}
+
+func (m *metadata) getMultipartVersion() uint64 {
+	m.RLock()
+	defer m.RUnlock()
+
+	return m.value.MultipartVersion
+}
+
+func (m *metadata) setMultipartVersion(db *grocksdb.DB, version uint64) error {
+	m.Lock()
+	defer m.Unlock()
+
+	m.value.MultipartVersion = version
+	return m.save(db)
+}
+
+func (m *metadata) save(db *grocksdb.DB) error {
+	return db.Put(defaultWriteOptions, metadataKeyFmt.Encode(), cbor.Marshal(m.value))
+}
+
+// TODO: Collapse with save.
+func (m *metadata) saveB(batch *grocksdb.WriteBatch) {
+	batch.Put(metadataKeyFmt.Encode(), cbor.Marshal(m.value))
+}
+
+// updatedNode is an element of the root updated nodes key.
+//
+// NOTE: Public fields of this structure are part of the on-disk format.
+type updatedNode struct {
+	_ struct{} `cbor:",toarray"` // nolint
+
+	Removed bool
+	Hash    hash.Hash
+}
+
+// rootsMetadata manages the roots metadata for a given version.
+//
+// NOTE: Public fields of this structure are part of the on-disk format.
+type rootsMetadata struct {
+	_ struct{} `cbor:",toarray"`
+
+	// Roots is the map of a root created in a version to any derived roots (in this or later versions).
+	Roots map[node.TypedHash][]node.TypedHash
+
+	// version is the version this metadata is for.
+	version uint64
+}
+
+// loadRootsMetadata loads the roots metadata for the given version from the database.
+func loadRootsMetadata(db *grocksdb.DB, version uint64) (*rootsMetadata, error) {
+	rootsMeta := &rootsMetadata{version: version}
+
+	s, err := db.Get(defaultReadOptions, rootsMetadataKeyFmt.Encode(version))
+	if err != nil {
+		return nil, fmt.Errorf("mkvs/rocksdb: failed to get roots metadata from backing store: %w", err)
+	}
+	defer s.Free()
+	switch s.Exists() {
+	case false:
+		rootsMeta.Roots = make(map[node.TypedHash][]node.TypedHash)
+	case true:
+		if err = cbor.Unmarshal(s.Data(), &rootsMeta); err != nil {
+			return nil, fmt.Errorf("mkvs/rocksdb: failed to unmarshal roots metadata: %w", err)
+		}
+	}
+	return rootsMeta, nil
+}
+
+// save saves the roots metadata to the database.
+func (rm *rootsMetadata) save(batch *grocksdb.WriteBatch) {
+	batch.Put(rootsMetadataKeyFmt.Encode(rm.version), cbor.Marshal(rm))
+}
diff --git a/go/storage/mkvs/db/rocksdb/rocksdb.go b/go/storage/mkvs/db/rocksdb/rocksdb.go
new file mode 100644
index 00000000000..8f105bdd5f2
--- /dev/null
+++ b/go/storage/mkvs/db/rocksdb/rocksdb.go
@@ -0,0 +1,1025 @@
+// Package badger provides a RocksDB-backed node database.
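
// [Editor's note: illustrative sketch, not part of this patch.]
// Typical read-modify-write cycle for the rootsMetadata defined above: load
// (which yields an empty Roots map when the version has no entry yet),
// mutate, then stage via save into the same WriteBatch as the rest of the
// update so that everything commits atomically:
//
//	meta, err := loadRootsMetadata(db, version)
//	if err != nil { /* ... */ }
//	meta.Roots[parent] = append(meta.Roots[parent], child)
//	meta.save(batch) // staged only; durable after db.Write(..., batch)
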
+package rocksdb + +import ( + "context" + "encoding/binary" + "fmt" + "runtime" + "sync" + + "github.com/linxGnu/grocksdb" + + "github.com/oasisprotocol/oasis-core/go/common" + "github.com/oasisprotocol/oasis-core/go/common/cbor" + "github.com/oasisprotocol/oasis-core/go/common/crypto/hash" + "github.com/oasisprotocol/oasis-core/go/common/keyformat" + "github.com/oasisprotocol/oasis-core/go/common/logging" + "github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/api" + "github.com/oasisprotocol/oasis-core/go/storage/mkvs/node" + "github.com/oasisprotocol/oasis-core/go/storage/mkvs/writelog" +) + +const ( + dbVersion = 1 + // multipartVersionNone is the value used for the multipart version in metadata + // when no multipart restore is in progress. + multipartVersionNone uint64 = 0 +) + +// Metadata CF keys (not timestamped). +var ( + // rootsMetadataKeyFmt is the key format for roots metadata. The key format is (version). + // + // Value is CBOR-serialized rootsMetadata. + rootsMetadataKeyFmt = keyformat.New(0x02, uint64(0)) + + // rootUpdatedNodesKeyFmt is the key format for the pending updated nodes for the + // given root that need to be removed only in case the given root is not among + // the finalized roots. They key format is (version, root). + // + // Value is CBOR-serialized []updatedNode. + rootUpdatedNodesKeyFmt = keyformat.New(0x03, uint64(0), &node.TypedHash{}) + + // metadataKeyFmt is the key format for metadata. + // + // Value is CBOR-serialized metadata. + metadataKeyFmt = keyformat.New(0x04) + + // multipartRestoreNodeLogKeyFmt is the key format for the nodes inserted during a chunk restore. + // Once a set of chunks is fully restored, these entries should be removed. If chunk restoration + // is interrupted for any reason, the nodes associated with these keys should be removed, along + // with these entries. + // + // Value is empty. + multipartRestoreNodeLogKeyFmt = keyformat.New(0x05, &node.TypedHash{}) +) + +// Node CF keys (timestamped). +var ( + // nodeKeyFmt is the key format for nodes (node hash). + // + // Value is serialized node. + nodeKeyFmt = keyformat.New(0x00, &hash.Hash{}) + + // TODO: separate CF? + // writeLogKeyFmt is the key format for write logs (version, new root, + // old root). + // + // Value is CBOR-serialized write log. + writeLogKeyFmt = keyformat.New(0x01, uint64(0), &node.TypedHash{}, &node.TypedHash{}) + + // rootNodeKeyFmt is the key format for root nodes (node hash). + // + // Value is empty. + rootNodeKeyFmt = keyformat.New(0x06, &node.TypedHash{}) +) + +var ( + defaultWriteOptions = grocksdb.NewDefaultWriteOptions() + defaultReadOptions = grocksdb.NewDefaultReadOptions() +) + +const ( + cfMetadataName = "default" + cfNodeTree = "node" + // cfStateTreeName = "state_tree" + // cfIOTreeName = "io_tree" +) + +// New creates a new RocksDB-backed node database. +func New(cfg *api.Config) (api.NodeDB, error) { + db := &rocksdbNodeDB{ + logger: logging.GetLogger("mkvs/db/rocksdb"), + namespace: cfg.Namespace, + discardWriteLogs: cfg.DiscardWriteLogs, + readOnly: cfg.ReadOnly, + } + + // XXX: Most of these options were taken from cosmos SDK. + // Experiment/modify if needed. Most of these can be adjusted + // on a live database. + + // Create options for the metadata column family. + optsMeta := grocksdb.NewDefaultOptions() + optsMeta.SetCreateIfMissing(true) + optsMeta.SetCreateIfMissingColumnFamilies(true) + + // Create options for the node column families. + // TODO: Consider separate options for state vs. io. 
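
// [Editor's note: illustrative sketch, not part of this patch.]
// The key formats defined above support partial encoding: passing only a
// leading subset of the fields yields a byte prefix suitable for seeding a
// range scan, which is how Finalize and Prune below iterate write logs:
//
//	byVersion := writeLogKeyFmt.Encode(version)            // all logs in a version
//	byRootPair := writeLogKeyFmt.Encode(version, &newRoot) // logs ending in newRoot
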
+ optsNodes := grocksdb.NewDefaultOptions() + optsNodes.SetCreateIfMissing(true) + + optsNodes.SetComparator(createTimestampComparator()) + optsNodes.IncreaseParallelism(runtime.NumCPU()) + optsNodes.OptimizeLevelStyleCompaction(512 * 1024 * 1024) + optsNodes.SetTargetFileSizeMultiplier(2) + optsNodes.SetLevelCompactionDynamicLevelBytes(true) + + bbto := grocksdb.NewDefaultBlockBasedTableOptions() + bbto.SetBlockSize(32 * 1024) + if cfg.MaxCacheSize == 0 { + // Default to 64mb block cache size if not configured. + bbto.SetBlockCache(grocksdb.NewLRUCache(64 * 1024 * 1024)) + } else { + bbto.SetBlockCache(grocksdb.NewLRUCache(uint64(cfg.MaxCacheSize))) + } + bbto.SetFilterPolicy(grocksdb.NewRibbonHybridFilterPolicy(9.9, 1)) + bbto.SetIndexType(grocksdb.KBinarySearchWithFirstKey) + optsNodes.SetBlockBasedTableFactory(bbto) + optsNodes.SetCompressionOptionsParallelThreads(4) + + /* + // Apparently with dict compression the file writer doesn't report file size: + // https://github.com/facebook/rocksdb/issues/11146 + // compression options at bottommost level + opts.SetBottommostCompression(grocksdb.ZSTDCompression) + + compressOpts := grocksdb.NewDefaultCompressionOptions() + compressOpts.MaxDictBytes = 112640 // 110k + compressOpts.Level = 12 + + opts.SetBottommostCompressionOptions(compressOpts, true) + opts.SetBottommostCompressionOptionsZstdMaxTrainBytes(compressOpts.MaxDictBytes*100, true) + + */ + + var err error + var cfHandles []*grocksdb.ColumnFamilyHandle + switch cfg.ReadOnly { + case true: + db.db, cfHandles, err = grocksdb.OpenDbForReadOnlyColumnFamilies( + optsMeta, + cfg.DB, + []string{ + cfMetadataName, + cfNodeTree, + // cfStateTreeName, + // cfIOTreeName, + }, + []*grocksdb.Options{ + optsMeta, + optsNodes, + // optsNodes, + }, + false) + case false: + db.db, cfHandles, err = grocksdb.OpenDbColumnFamilies( + optsMeta, + cfg.DB, + []string{ + cfMetadataName, + cfNodeTree, + // cfStateTreeName, + // cfIOTreeName, + }, + []*grocksdb.Options{ + optsMeta, + optsNodes, + // optsNodes, + }, + ) + } + if err != nil { + return nil, fmt.Errorf("mkvs/rocksdb: failed to open database: %w", err) + } + db.cfMetadata = cfHandles[0] // Also the default handle. + db.cfNode = cfHandles[1] + // db.cfStateTree = cfHandles[1] + // db.cfIOTree = cfHandles[2] + + // Load database metadata. + if err = db.load(); err != nil { + db.db.Close() + return nil, fmt.Errorf("mkvs/rocksdb: failed to load metadata: %w", err) + } + + // Cleanup any multipart restore remnants, since they can't be used anymore. + if err = db.cleanMultipartLocked(true); err != nil { + db.db.Close() + return nil, fmt.Errorf("mkvs/rocksdb: failed to clean leftovers from multipart restore: %w", err) + } + + return db, nil +} + +type rocksdbNodeDB struct { + logger *logging.Logger + readOnly bool + + namespace common.Namespace + + // metaUpdateLock must be held at any point where data at tsMetadata is read and updated. This + // is required because all metadata updates happen at the same timestamp and as such conflicts + // cannot be detected. 
+ metaUpdateLock sync.Mutex + meta metadata + multipartVersion uint64 + + discardWriteLogs bool + + db *grocksdb.DB + cfMetadata *grocksdb.ColumnFamilyHandle + cfNode *grocksdb.ColumnFamilyHandle + // cfStateTree *grocksdb.ColumnFamilyHandle + // cfIOTree *grocksdb.ColumnFamilyHandle + + closeOnce sync.Once +} + +/* +func (d *rocksdbNodeDB) getColumnFamilyForRoot(root node.Root) *grocksdb.ColumnFamilyHandle { + switch root.Type { + case node.RootTypeState: + return d.cfStateTree + case node.RootTypeIO: + return d.cfIOTree + default: + panic(fmt.Errorf("unsupported root type: %s", root.Type)) + } +} + + +func (d *rocksdbNodeDB) getColumnFamilyForType(rootType node.RootType) *grocksdb.ColumnFamilyHandle { + switch rootType { + case node.RootTypeState: + return d.cfStateTree + case node.RootTypeIO: + return d.cfIOTree + default: + panic(fmt.Errorf("unsupported root type: %s", rootType)) + } +} +*/ + +func (d *rocksdbNodeDB) load() error { + /* + // Check first if the database is even usable. + _, err := d.db.Get(migrationMetaKeyFm.Encode()) + if err == nil { + return api.ErrUpgradeInProgress + } + */ + + // Load metadata. + item, err := d.db.Get(defaultReadOptions, metadataKeyFmt.Encode()) + switch err { + case nil: + if !item.Exists() { + break + } + defer item.Free() + + // Metadata already exists, just load it and verify that it is + // compatible with what we have here. + if err := cbor.UnmarshalTrusted(item.Data(), &d.meta.value); err != nil { + return err + } + + if d.meta.value.Version != dbVersion { + return fmt.Errorf("incompatible database version (expected: %d got: %d)", + dbVersion, + d.meta.value.Version, + ) + } + if !d.meta.value.Namespace.Equal(&d.namespace) { + return fmt.Errorf("incompatible namespace (expected: %s got: %s)", + d.namespace, + d.meta.value.Namespace, + ) + } + return nil + default: + return err + } + + // No metadata exists, create some. + d.meta.value.Version = dbVersion + d.meta.value.Namespace = d.namespace + if err = d.meta.save(d.db); err != nil { + return err + } + + return nil +} + +func (d *rocksdbNodeDB) sanityCheckNamespace(ns common.Namespace) error { + if !ns.Equal(&d.namespace) { + return api.ErrBadNamespace + } + return nil +} + +func (d *rocksdbNodeDB) checkRoot(root node.Root) error { + rootHash := node.TypedHashFromRoot(root) + + s, err := d.db.GetCF(timestampReadOptions(root.Version), d.cfNode, rootNodeKeyFmt.Encode(&rootHash)) + if err != nil { + d.logger.Error("failed to check root existence", + "err", err, + ) + return fmt.Errorf("mkvs/rocksdb: failed to get root from backing store: %w", err) + } + defer s.Free() + if !s.Exists() { + return api.ErrRootNotFound + } + return nil +} + +// Implements api.NodeDB. +func (d *rocksdbNodeDB) GetNode(root node.Root, ptr *node.Pointer) (node.Node, error) { + if ptr == nil || !ptr.IsClean() { + panic("mkvs/rocksdb: attempted to get invalid pointer from node database") + } + if err := d.sanityCheckNamespace(root.Namespace); err != nil { + return nil, err + } + + // If the version is earlier than the earliest version, we don't have the node (it was pruned). + // Note that the key can still be present in the database until it gets compacted. + if root.Version < d.meta.getEarliestVersion() { + return nil, api.ErrNodeNotFound + } + + // Check if the root actually exists. 
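
// [Editor's note, not part of this patch.] Pruned history is reclaimed
// lazily by compaction, so a timestamped read below the earliest version can
// still surface stale bytes; the earliest-version gate above (rather than
// key absence) is what actually provides the "pruned means not found"
// contract.
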
+ if err := d.checkRoot(root); err != nil { + return nil, err + } + + // cf := d.getColumnFamilyForRoot(root) + s, err := d.db.GetCF(timestampReadOptions(root.Version), d.cfNode, nodeKeyFmt.Encode(&ptr.Hash)) + if err != nil { + return nil, fmt.Errorf("mkvs/rocksdb: failed to get node from backing store: %w", err) + } + defer s.Free() + if !s.Exists() { + fmt.Println("fail here", root.Version) + return nil, api.ErrNodeNotFound + } + + var n node.Node + n, err = node.UnmarshalBinary(s.Data()) + if err != nil { + return nil, fmt.Errorf("mkvs/rocksdb: failed to unmarshal node: %w", err) + } + + return n, nil +} + +func (d *rocksdbNodeDB) GetWriteLog(ctx context.Context, startRoot, endRoot node.Root) (writelog.Iterator, error) { + if d.discardWriteLogs { + return nil, api.ErrWriteLogNotFound + } + if !endRoot.Follows(&startRoot) { + return nil, api.ErrRootMustFollowOld + } + if err := d.sanityCheckNamespace(startRoot.Namespace); err != nil { + return nil, err + } + // If the version is earlier than the earliest version, we don't have the roots. + if endRoot.Version < d.meta.getEarliestVersion() { + return nil, api.ErrWriteLogNotFound + } + + // Check if the root actually exists. + if err := d.checkRoot(endRoot); err != nil { + return nil, err + } + + // Start at the end root and search towards the start root. This assumes that the + // chains are not long and that there is not a lot of forks as in that case performance + // would suffer. + // + // In reality the two common cases are: + // - State updates: s -> s' (a single hop) + // - I/O updates: empty -> i -> io (two hops) + // + // For this reason, we currently refuse to traverse more than two hops. + const maxAllowedHops = 2 + + type wlItem struct { + depth uint8 + endRootHash node.TypedHash + logKeys [][]byte + logRoots []node.TypedHash + } + // NOTE: We could use a proper deque, but as long as we keep the number of hops and + // forks low, this should not be a problem. + queue := []*wlItem{{depth: 0, endRootHash: node.TypedHashFromRoot(endRoot)}} + startRootHash := node.TypedHashFromRoot(startRoot) + // cf := d.getColumnFamilyForType(startRootHash.Type()) + for len(queue) > 0 { + if ctx.Err() != nil { + return nil, ctx.Err() + } + + curItem := queue[0] + queue = queue[1:] + + wl, err := func() (writelog.Iterator, error) { + // Iterate over all write logs that result in the current item. + start := writeLogKeyFmt.Encode(endRoot.Version, &curItem.endRootHash) + + // TODO: maybe support prefix iterator? (manually configure start & end). + it := newIterator(d.db.NewIteratorCF(timestampReadOptions(endRoot.Version), d.cfNode), start, nil, false) + defer it.Close() + + for ; it.Valid(); it.Next() { + + if ctx.Err() != nil { + return nil, ctx.Err() + } + + key := it.Key() + + var decVersion uint64 + var decEndRootHash node.TypedHash + var decStartRootHash node.TypedHash + + if !writeLogKeyFmt.Decode(key, &decVersion, &decEndRootHash, &decStartRootHash) { + return nil, nil + } + // TODO: check other such places. + if decVersion != endRoot.Version || !decEndRootHash.Equal(&curItem.endRootHash) { + return nil, nil + } + + nextItem := wlItem{ + depth: curItem.depth + 1, + endRootHash: decStartRootHash, + // Only store log keys to avoid keeping everything in memory while + // we are searching for the right path. + logKeys: append(curItem.logKeys, key), + logRoots: append(curItem.logRoots, curItem.endRootHash), + } + if nextItem.endRootHash.Equal(&startRootHash) { + // Path has been found, deserialize and stream write logs. 
+ var index int + return api.ReviveHashedDBWriteLogs(ctx, + func() (node.Root, api.HashedDBWriteLog, error) { + if index >= len(nextItem.logKeys) { + return node.Root{}, nil, nil + } + + key := nextItem.logKeys[index] + root := node.Root{ + Namespace: endRoot.Namespace, + Version: endRoot.Version, + Type: nextItem.logRoots[index].Type(), + Hash: nextItem.logRoots[index].Hash(), + } + + item, err := d.db.GetCF(timestampReadOptions(endRoot.Version), d.cfNode, key) + if err != nil || !item.Exists() { + return node.Root{}, nil, err + } + defer item.Free() + + var log api.HashedDBWriteLog + if err := cbor.UnmarshalTrusted(item.Data(), &log); err != nil { + return node.Root{}, nil, err + } + + index++ + return root, log, nil + }, + func(root node.Root, h hash.Hash) (*node.LeafNode, error) { + leaf, err := d.GetNode(root, &node.Pointer{Hash: h, Clean: true}) + if err != nil { + return nil, err + } + return leaf.(*node.LeafNode), nil + }, + func() { + }, + ) + } + + if nextItem.depth < maxAllowedHops { + queue = append(queue, &nextItem) + } + } + + return nil, nil + }() + if wl != nil || err != nil { + return wl, err + } + } + + return nil, api.ErrWriteLogNotFound +} + +func (d *rocksdbNodeDB) GetLatestVersion() (uint64, bool) { + return d.meta.getLastFinalizedVersion() +} + +func (d *rocksdbNodeDB) GetEarliestVersion() uint64 { + return d.meta.getEarliestVersion() +} + +func (d *rocksdbNodeDB) GetRootsForVersion(version uint64) ([]node.Root, error) { + // If the version is earlier than the earliest version, we don't have the roots. + if version < d.meta.getEarliestVersion() { + return nil, nil + } + + rootsMeta, err := loadRootsMetadata(d.db, version) + if err != nil { + return nil, err + } + + roots := make([]node.Root, 0, len(rootsMeta.Roots)) + for rootHash := range rootsMeta.Roots { + roots = append(roots, node.Root{ + Namespace: d.namespace, + Version: version, + Type: rootHash.Type(), + Hash: rootHash.Hash(), + }) + } + return roots, nil +} + +func (d *rocksdbNodeDB) HasRoot(root node.Root) bool { + if err := d.sanityCheckNamespace(root.Namespace); err != nil { + return false + } + + // An empty root is always implicitly present. + if root.Hash.IsEmpty() { + return true + } + + // If the version is earlier than the earliest version, we don't have the root. + if root.Version < d.meta.getEarliestVersion() { + return false + } + + rootsMeta, err := loadRootsMetadata(d.db, root.Version) + if err != nil { + panic(err) + } + + _, exists := rootsMeta.Roots[node.TypedHashFromRoot(root)] + return exists +} + +func (d *rocksdbNodeDB) Finalize(roots []node.Root) error { + if len(roots) == 0 { + return fmt.Errorf("mkvs/badger: need at least one root to finalize") + } + version := roots[0].Version + + d.metaUpdateLock.Lock() + defer d.metaUpdateLock.Unlock() + + if d.multipartVersion != multipartVersionNone && d.multipartVersion != version { + return api.ErrInvalidMultipartVersion + } + + // Make sure that the previous version has been finalized (if we are not restoring). + lastFinalizedVersion, exists := d.meta.getLastFinalizedVersion() + if d.multipartVersion == multipartVersionNone && version > 0 && exists && lastFinalizedVersion < (version-1) { + return api.ErrNotFinalized + } + // Make sure that this version has not yet been finalized. + if exists && version <= lastFinalizedVersion { + return api.ErrAlreadyFinalized + } + + // Determine the set of finalized roots. Finalization is transitive, so if + // a parent root is finalized the child should be considered finalized too. 
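
// [Editor's note, not part of this patch.] Precisely: the loop below walks
// the derived-root links in the child-to-parent direction, marking a root
// finalized when any root *derived from it* is already in the finalized set,
// and iterates to a fixpoint since such links may chain across several roots
// within the same version.
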
+ finalizedRoots := make(map[node.TypedHash]bool) + for _, root := range roots { + if root.Version != version { + return fmt.Errorf("mkvs/badger: roots to finalize don't have matching versions") + } + finalizedRoots[node.TypedHashFromRoot(root)] = true + } + var rootsChanged bool + rootsMeta, err := loadRootsMetadata(d.db, version) + if err != nil { + return err + } + for updated := true; updated; { + updated = false + + for rootHash, derivedRoots := range rootsMeta.Roots { + if len(derivedRoots) == 0 { + continue + } + + for _, nextRoot := range derivedRoots { + if !finalizedRoots[rootHash] && finalizedRoots[nextRoot] { + finalizedRoots[rootHash] = true + updated = true + } + } + } + } + + // Sanity check the input roots list. + for iroot := range finalizedRoots { + h := iroot.Hash() + if _, ok := rootsMeta.Roots[iroot]; !ok && !h.IsEmpty() { + return api.ErrRootNotFound + } + } + + batch := grocksdb.NewWriteBatch() + defer batch.Destroy() + ts := timestampFromVersion(version) + + // Go through all roots and prune them based on whether they are finalized or not. + maybeLoneNodes := make(map[hash.Hash]bool) + notLoneNodes := make(map[hash.Hash]bool) + + for rootHash := range rootsMeta.Roots { + // TODO: Consider colocating updated nodes with the root metadata. + rootUpdatedNodesKey := rootUpdatedNodesKeyFmt.Encode(version, &rootHash) + + // Load hashes of nodes added during this version for this root. + item, err := d.db.Get(defaultReadOptions, rootUpdatedNodesKey) + if err != nil { + panic(fmt.Errorf("mkvs/rocksdb: corrupted root updated nodes index: %w", err)) + } + if !item.Exists() { + panic(fmt.Errorf("mkvs/rocksdb: missing root updated nodes index")) + } + + var updatedNodes []updatedNode + if err := cbor.UnmarshalTrusted(item.Data(), &updatedNodes); err != nil { + panic(fmt.Errorf("mkvs/badger: corrupted root updated nodes index: %w", err)) + } + item.Free() // TODO: wrapper. + + if finalizedRoots[rootHash] { + // Make sure not to remove any nodes shared with finalized roots. + for _, n := range updatedNodes { + if n.Removed { + maybeLoneNodes[n.Hash] = true + } else { + notLoneNodes[n.Hash] = true + } + } + } else { + // Remove any non-finalized roots. It is safe to remove these nodes as Badger's version + // control will make sure they are not removed if they are resurrected in any later + // version as long as we make sure that these nodes are not shared with any finalized + // roots added in the same version. + for _, n := range updatedNodes { + if !n.Removed { + maybeLoneNodes[n.Hash] = true + } + } + + delete(rootsMeta.Roots, rootHash) + rootsChanged = true + + // Remove write logs for the non-finalized root. 
+ if !d.discardWriteLogs { + fmt.Println("DISCARDING HERE") + if err = func() error { + rootWriteLogsPrefix := writeLogKeyFmt.Encode(version, &rootHash) + fmt.Println("Prefix: ", version, rootWriteLogsPrefix) + wit := newIterator(d.db.NewIteratorCF(timestampReadOptions(version), d.cfNode), rootWriteLogsPrefix, nil, false) + defer wit.Close() + + fmt.Println("AA", wit.Valid()) + // cf := d.getColumnFamilyForType(rootHash.Type()) + for ; wit.Valid(); wit.Next() { + fmt.Println("COME HERE") + key := wit.Key() + + var decVersion uint64 + var decRootHash node.TypedHash + var decRootHash2 node.TypedHash + if !writeLogKeyFmt.Decode(key, &decVersion, &decRootHash, &decRootHash2) { + return nil + } + if decVersion != version || !decRootHash.Equal(&rootHash) { + return nil + } + + fmt.Println("DELETING HERE", ts, d.cfNode) + batch.DeleteCFWithTS(d.cfNode, key, ts[:]) + } + return nil + }(); err != nil { + return err + } + } + } + + // Set of updated nodes no longer needed after finalization. + batch.Delete(rootUpdatedNodesKey) + } + + // Clean any lone nodes. + for h := range maybeLoneNodes { + if notLoneNodes[h] { + continue + } + + // TODO: get CF for hash? + // batch.DeleteCFWithTS(d.cfIOTree, nodeKeyFmt.Encode(&h), ts[:]) + // batch.DeleteCFWithTS(d.cfStateTree, nodeKeyFmt.Encode(&h), ts[:]) + batch.DeleteCFWithTS(d.cfNode, nodeKeyFmt.Encode(&h), ts[:]) + } + + // Save roots metadata if changed. + if rootsChanged { + rootsMeta.save(batch) + } + + // Update last finalized version. + d.meta.setLastFinalizedVersion(batch, version) + + // Commit batch. + if err := d.db.Write(defaultWriteOptions, batch); err != nil { + return fmt.Errorf("mkvs/badger: failed to commit finalized roots: %w", err) + } + + // Clean multipart metadata if there is any. + if d.multipartVersion != multipartVersionNone { + if err := d.cleanMultipartLocked(false); err != nil { + return err + } + } + return nil +} + +func (d *rocksdbNodeDB) Prune(ctx context.Context, version uint64) error { + ts := timestampFromVersion(version) + + d.metaUpdateLock.Lock() + defer d.metaUpdateLock.Unlock() + + if d.multipartVersion != multipartVersionNone { + return api.ErrMultipartInProgress + } + + // Make sure that the version that we try to prune has been finalized. + lastFinalizedVersion, exists := d.meta.getLastFinalizedVersion() + if !exists || lastFinalizedVersion < version { + return api.ErrNotFinalized + } + // Make sure that the version that we are trying to prune is the earliest version. + if version != d.meta.getEarliestVersion() { + return api.ErrNotEarliest + } + + rootsMeta, err := loadRootsMetadata(d.db, version) + if err != nil { + return err + } + + batch := grocksdb.NewWriteBatch() + for rootHash, derivedRoots := range rootsMeta.Roots { + if len(derivedRoots) > 0 { + // Not a lone root. + continue + } + + // Traverse the root and prune all items created in this version. + root := node.Root{ + Namespace: d.namespace, + Version: version, + Type: rootHash.Type(), + Hash: rootHash.Hash(), + } + var innerErr error + err := api.Visit(ctx, d, root, func(ctx context.Context, n node.Node) bool { + h := n.GetHash() + + s, ts, err := d.db.GetCFWithTS(timestampReadOptions(root.Version), d.cfNode, nodeKeyFmt.Encode(&h)) + if err != nil { + return false + } + defer s.Free() + if !s.Exists() { + return false + } + + // TODO: Extract. 
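
// [Editor's note: illustrative sketch, not part of this patch.]
// Re the "extract" TODO above: GetCFWithTS returns the key's timestamp as a
// raw 8-byte slice, which under the u64 comparator configured for this CF is
// the little-endian version the key was written at. A helper (for when the
// slice is not needed afterwards) might look like:
//
//	func versionFromTS(ts *grocksdb.Slice) uint64 {
//		defer ts.Free()
//		return binary.LittleEndian.Uint64(ts.Data())
//	}
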
+ itemTS := binary.LittleEndian.Uint64(ts.Data()) + defer ts.Free() + + if itemTS == version { + batch.DeleteCFWithTS(d.cfNode, nodeKeyFmt.Encode(&h), ts.Data()) + } + return true + }) + if innerErr != nil { + return innerErr + } + if err != nil { + return err + } + + batch.Delete(rootNodeKeyFmt.Encode(&rootHash)) + } + + // Prune all write logs in version. + if !d.discardWriteLogs { + prefix := writeLogKeyFmt.Encode(version) + wit := newIterator(d.db.NewIteratorCF(timestampReadOptions(version), d.cfNode), prefix, nil, false) + defer wit.Close() + + for ; wit.Valid(); wit.Next() { + key := wit.Key() + + var decVersion uint64 + var decRootHash node.TypedHash + var decRootHash2 node.TypedHash + if !writeLogKeyFmt.Decode(key, &decVersion, &decRootHash, &decRootHash2) { + break + } + if decVersion != version { + break + } + + batch.DeleteCFWithTS(d.cfNode, key, ts[:]) // HM? seems wrong. + } + + } + + // Update metadata. + d.meta.setEarliestVersion(batch, version+1) + + if err := d.db.Write(defaultWriteOptions, batch); err != nil { + return fmt.Errorf("mkvs/rocksdb: failed to prune version %d: %w", version, err) + } + + // if err := d.db.IncreaseFullHistoryTsLow(d.cfIOTree, ts[:]); err != nil { + // return fmt.Errorf("mkvs/rocksdb: failed to prune version %d from IO tree: %w", version, err) + // } + // if err := d.db.IncreaseFullHistoryTsLow(d.cfStateTree, ts[:]); err != nil { + // return fmt.Errorf("mkvs/rocksdb: failed to prune version %d from state tree: %w", version, err) + // } + if err := d.db.IncreaseFullHistoryTsLow(d.cfNode, ts[:]); err != nil { + return fmt.Errorf("mkvs/rocksdb: failed to prune version %d from nodes tree: %w", version, err) + } + return nil +} + +func (d *rocksdbNodeDB) StartMultipartInsert(version uint64) error { + d.metaUpdateLock.Lock() + defer d.metaUpdateLock.Unlock() + + if version == multipartVersionNone { + return api.ErrInvalidMultipartVersion + } + + if d.multipartVersion != multipartVersionNone { + if d.multipartVersion != version { + return api.ErrMultipartInProgress + } + // Multipart already initialized at the same version, so this was + // probably called e.g. as part of a further checkpoint restore. + return nil + } + + if err := d.meta.setMultipartVersion(d.db, version); err != nil { + return err + } + d.multipartVersion = version + + return nil +} + +func (d *rocksdbNodeDB) AbortMultipartInsert() error { + d.metaUpdateLock.Lock() + defer d.metaUpdateLock.Unlock() + + return d.cleanMultipartLocked(true) +} + +// Assumes metaUpdateLock is held when called. +func (d *rocksdbNodeDB) cleanMultipartLocked(removeNodes bool) error { + var version uint64 + + if d.multipartVersion != multipartVersionNone { + version = d.multipartVersion + } else { + version = d.meta.getMultipartVersion() + } + if version == multipartVersionNone { + // No multipart in progress, but it's not an error to call in a situation like this. 
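
// [Editor's note, not part of this patch.] The version fallback above makes
// cleanup crash-safe: a restore running in this process records its version
// in memory, while one interrupted in a previous run is recovered from the
// persisted metadata, so stale nodes and log entries are removed either way.
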
+ return nil + } + + it := newIterator(d.db.NewIterator(defaultReadOptions), multipartRestoreNodeLogKeyFmt.Encode(), nil, false) + defer it.Close() + + batch := grocksdb.NewWriteBatch() + defer batch.Destroy() + ts := timestampFromVersion(version) + var logged bool + for ; it.Valid(); it.Next() { + key := it.Key() + + var hash node.TypedHash + if !multipartRestoreNodeLogKeyFmt.Decode(key, &hash) { + break + } + + if removeNodes { + if !logged { + d.logger.Info("removing some nodes from a multipart restore") + logged = true + } + switch hash.Type() { + case node.RootTypeInvalid: + fmt.Println("REMOVING NODE at TS", ts) + h := hash.Hash() + batch.DeleteCFWithTS(d.cfNode, nodeKeyFmt.Encode(&h), ts[:]) + default: + fmt.Println("REMOVING ROOT NODE at TS", ts, hash.Hash()) + // cf := d.getColumnFamilyForType(hash.Type()) + batch.DeleteCFWithTS(d.cfNode, rootNodeKeyFmt.Encode(&hash), ts[:]) + } + } + // Delete the metadata entry as well. + batch.Delete(key) + } + + // Apply the batch first. If anything fails, having corrupt + // multipart info in d.meta shouldn't hurt us next run. + if err := d.db.Write(defaultWriteOptions, batch); err != nil { + return err + } + + if err := d.meta.setMultipartVersion(d.db, multipartVersionNone); err != nil { + return err + } + + d.multipartVersion = multipartVersionNone + return nil +} + +func (d *rocksdbNodeDB) NewBatch(oldRoot node.Root, version uint64, chunk bool) (api.Batch, error) { + // WARNING: There is a maximum batch size and maximum batch entry count. + // Both of these things are derived from the MaxTableSize option. + // + // The size limit also applies to normal transactions, so the "right" + // thing to do would be to either crank up MaxTableSize or maybe split + // the transaction out. + + if d.readOnly { + return nil, api.ErrReadOnly + } + + d.metaUpdateLock.Lock() + defer d.metaUpdateLock.Unlock() + + if d.multipartVersion != multipartVersionNone && d.multipartVersion != version { + return nil, api.ErrInvalidMultipartVersion + } + if chunk != (d.multipartVersion != multipartVersionNone) { + return nil, api.ErrMultipartInProgress + } + + var logBatch *grocksdb.WriteBatch + if d.multipartVersion != multipartVersionNone { + // The node log is at a different version than the nodes themselves, + // which is awkward. 
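
// [Editor's note, not part of this patch.] Hence the second batch: restore
// log entries live in the untimestamped metadata (default) CF, while the
// nodes they track are written into the node CF at the restore version's
// timestamp. Commit flushes the log batch before the node batch, so a crash
// between the two writes leaves at worst log entries pointing at absent
// nodes, which cleanMultipartLocked above removes harmlessly.
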
+ logBatch = grocksdb.NewWriteBatch() + } + + return &rocksdbBatch{ + db: d, + version: version, + bat: grocksdb.NewWriteBatch(), + multipartNodes: logBatch, + oldRoot: oldRoot, + chunk: chunk, + }, nil +} + +func (d *rocksdbNodeDB) Size() (uint64, error) { + meta := d.db.GetColumnFamilyMetadataCF(d.cfMetadata) + // io := d.db.GetColumnFamilyMetadataCF(d.cfIOTree) + // state := d.db.GetColumnFamilyMetadataCF(d.cfStateTree) + node := d.db.GetColumnFamilyMetadataCF(d.cfNode) + + return meta.Size() + node.Size(), nil // io.Size() + state.Size(), nil +} + +func (d *rocksdbNodeDB) Sync() error { + opts := grocksdb.NewDefaultFlushOptions() + return d.db.FlushCFs([]*grocksdb.ColumnFamilyHandle{d.cfMetadata, d.cfNode}, opts) +} + +func (d *rocksdbNodeDB) Close() { + d.closeOnce.Do(func() { + d.db.Close() + d.cfMetadata = nil + // d.cfIOTree = nil + // d.cfStateTree = nil + d.cfNode = nil + d.db = nil + }) +} diff --git a/go/storage/mkvs/db/rocksdb/rocksdb_test.go b/go/storage/mkvs/db/rocksdb/rocksdb_test.go new file mode 100644 index 00000000000..d9f2585d74f --- /dev/null +++ b/go/storage/mkvs/db/rocksdb/rocksdb_test.go @@ -0,0 +1,299 @@ +package rocksdb + +import ( + "bytes" + "context" + "fmt" + "os" + "strconv" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/oasisprotocol/oasis-core/go/common" + "github.com/oasisprotocol/oasis-core/go/storage/mkvs" + "github.com/oasisprotocol/oasis-core/go/storage/mkvs/checkpoint" + "github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/api" + "github.com/oasisprotocol/oasis-core/go/storage/mkvs/node" + "github.com/oasisprotocol/oasis-core/go/storage/mkvs/writelog" +) + +var ( + nodePrefix = nodeKeyFmt.Encode() + + logPrefix = multipartRestoreNodeLogKeyFmt.Encode() + + testNs = common.NewTestNamespaceFromSeed([]byte("rocksdb node db test ns"), 0) + + dbCfg = &api.Config{ + Namespace: testNs, + MaxCacheSize: 16 * 1024 * 1024, + NoFsync: true, + MemoryOnly: true, + } + + testValues = [][]byte{ + []byte("colorless green ideas sleep furiously"), + []byte("excepting understandable chairs piously"), + []byte("at the prickle for rainbow hoovering"), + } +) + +type keySet map[string]struct{} + +type test struct { + require *require.Assertions + ctx context.Context + dir string + rocksdb *rocksdbNodeDB + ckMeta *checkpoint.Metadata + ckNodes keySet +} + +func fillDB( + ctx context.Context, + require *require.Assertions, + values [][]byte, + prevRoot *node.Root, + version, commitVersion uint64, + ndb api.NodeDB, +) node.Root { + if prevRoot == nil { + emptyRoot := node.Root{ + Namespace: testNs, + Version: version, + Type: node.RootTypeState, + } + emptyRoot.Hash.Empty() + prevRoot = &emptyRoot + } + + tree := mkvs.NewWithRoot(nil, ndb, *prevRoot) + require.NotNil(tree, "NewWithRoot()") + + var wl writelog.WriteLog + for i, val := range values { + wl = append(wl, writelog.LogEntry{Key: []byte(strconv.Itoa(i)), Value: val}) + } + + err := tree.ApplyWriteLog(ctx, writelog.NewStaticIterator(wl)) + require.NoError(err, "ApplyWriteLog()") + + _, hash, err := tree.Commit(ctx, testNs, commitVersion) + require.NoError(err, "Commit()") + + return node.Root{ + Namespace: testNs, + Version: version + 1, + Type: node.RootTypeState, + Hash: hash, + } +} + +func createCheckpoint(ctx context.Context, require *require.Assertions, dir string, values [][]byte, version uint64) (*checkpoint.Metadata, keySet) { + dbDir, err := os.MkdirTemp(dir, "checkpoint-db") + require.NoError(err, "TempDir()") + dbCfg := *dbCfg + dbCfg.DB = dbDir + ndb, err := New(&dbCfg) + 
require.NoError(err, "New()") + defer ndb.Close() + rocksdb := ndb.(*rocksdbNodeDB) + fc, err := checkpoint.NewFileCreator(dir, ndb) + require.NoError(err, "NewFileCreator()") + + ckRoot := fillDB(ctx, require, values, nil, version, 2, ndb) + ckMeta, err := fc.CreateCheckpoint(ctx, ckRoot, 1024*1024) + require.NoError(err, "CreateCheckpoint()") + + nodeKeys := keySet{} + it := newIterator(rocksdb.db.NewIteratorCF(timestampReadOptions(2), rocksdb.cfNode), nil, nil, false) + defer it.Close() + for ; it.Valid(); it.Next() { + // TODO: maybe gotta strip version. + if len(values) == 1 { + fmt.Println("AAAAa", it.Key()) + } + + if bytes.HasPrefix(it.Key(), nodePrefix) { + nodeKeys[string(it.Key())] = struct{}{} + } + } + fmt.Println("node keys", len(values), version, len(nodeKeys)) + + return ckMeta, nodeKeys +} + +func verifyNodes(require *require.Assertions, rocksdb *rocksdbNodeDB, version uint64, keySet keySet) { + notVisited := map[string]struct{}{} + for k := range keySet { + notVisited[k] = struct{}{} + } + + it := newIterator(rocksdb.db.NewIteratorCF(timestampReadOptions(version), rocksdb.cfNode), nil, nil, false) + defer it.Close() + for ; it.Valid(); it.Next() { + key := it.Key() + if !bytes.HasPrefix(key, nodePrefix) { + continue + } + _, ok := keySet[string(key)] + fmt.Println(version, key) + require.Equal(true, ok, "unexpected node in db") + delete(notVisited, string(key)) + } + require.Equal(0, len(notVisited), "some nodes not visited") +} + +func checkNoLogKeys(require *require.Assertions, rocksdb *rocksdbNodeDB) { + it := newIterator(rocksdb.db.NewIterator(defaultReadOptions), nil, nil, false) + defer it.Close() + for ; it.Valid(); it.Next() { + require.False(bytes.HasPrefix(it.Key(), logPrefix), "checkLogKeys()/iteration") + } +} + +func restoreCheckpoint(ctx *test, ckMeta *checkpoint.Metadata, ckNodes keySet) checkpoint.Restorer { + fc, err := checkpoint.NewFileCreator(ctx.dir, ctx.rocksdb) + ctx.require.NoError(err, "NewFileCreator() - 2") + + restorer, err := checkpoint.NewRestorer(ctx.rocksdb) + ctx.require.NoError(err, "NewRestorer()") + + err = ctx.rocksdb.StartMultipartInsert(ckMeta.Root.Version) + ctx.require.NoError(err, "StartMultipartInsert()") + err = restorer.StartRestore(ctx.ctx, ckMeta) + ctx.require.NoError(err, "StartRestore()") + for i := range ckMeta.Chunks { + idx := uint64(i) + chunkMeta, err := ckMeta.GetChunkMetadata(idx) + ctx.require.NoError(err, fmt.Sprintf("GetChunkMetadata(%d)", idx)) + func() { + r, w, err := os.Pipe() + ctx.require.NoError(err, "Pipe()") + errCh := make(chan error) + go func() { + _, errr := restorer.RestoreChunk(ctx.ctx, idx, r) + errCh <- errr + }() + err = fc.GetCheckpointChunk(ctx.ctx, chunkMeta, w) + w.Close() + errRestore := <-errCh + ctx.require.NoError(err, "GetCheckpointChunk()") + ctx.require.NoError(errRestore, "RestoreChunk()") + }() + } + + verifyNodes(ctx.require, ctx.rocksdb, ckMeta.Root.Version, ckNodes) + + return restorer +} + +func TestMultipartRestore(t *testing.T) { + ctx := context.Background() + wrap := func(testFunc func(ctx *test), initialValues [][]byte) func(*testing.T) { + return func(t *testing.T) { + require := require.New(t) + + dir, err := os.MkdirTemp("", "oasis-storage-database-test") + require.NoError(err, "TempDir()") + defer os.RemoveAll(dir) + + fmt.Println("Creating checkpoint with", initialValues) + ckMeta, ckNodes := createCheckpoint(ctx, require, dir, initialValues, 1) + fmt.Println("Got:", ckNodes) + + dbCfg := *dbCfg + dbCfg.DB = dir + ndb, err := New(&dbCfg) + require.NoError(err, "New() - 
2") + defer ndb.Close() + rocksdb := ndb.(*rocksdbNodeDB) + + testCtx := &test{ + require: require, + ctx: ctx, + dir: dir, + rocksdb: rocksdb, + ckMeta: ckMeta, + ckNodes: ckNodes, + } + testFunc(testCtx) + } + } + + t.Run("Abort", wrap(testAbort, testValues)) + t.Run("Finalize", wrap(testFinalize, testValues)) + t.Run("ExistingNodes", wrap(testExistingNodes, testValues[:1])) +} + +func testAbort(ctx *test) { + // Abort a restore, check nodes again. + // There should be no leftover nodes, and the log keys should be gone too. + restorer := restoreCheckpoint(ctx, ctx.ckMeta, ctx.ckNodes) + err := restorer.AbortRestore(ctx.ctx) + ctx.require.NoError(err, "AbortRestore()") + err = ctx.rocksdb.AbortMultipartInsert() + ctx.require.NoError(err, "AbortMultipartInsert()") + + verifyNodes(ctx.require, ctx.rocksdb, 2, keySet{}) + checkNoLogKeys(ctx.require, ctx.rocksdb) +} + +func testFinalize(ctx *test) { + // Finalize a restore, check nodes again. + // This time, all the restored nodes should be present, but the + // log keys should be gone. + restoreCheckpoint(ctx, ctx.ckMeta, ctx.ckNodes) + + // Test parameter sanity checking first. + err := ctx.rocksdb.Finalize(nil) + ctx.require.Error(err, "Finalize with no roots should fail") + + bogusRoot := ctx.ckMeta.Root + bogusRoot.Version++ + err = ctx.rocksdb.Finalize([]node.Root{ctx.ckMeta.Root, bogusRoot}) + ctx.require.Error(err, "Finalize with roots from different versions should fail") + + err = ctx.rocksdb.Finalize([]node.Root{ctx.ckMeta.Root}) + ctx.require.NoError(err, "Finalize()") + + verifyNodes(ctx.require, ctx.rocksdb, ctx.ckMeta.Root.Version, ctx.ckNodes) + checkNoLogKeys(ctx.require, ctx.rocksdb) +} + +func testExistingNodes(ctx *test) { + // Create two checkpoints, so we have two sets of nodes. + // The first checkpoint will be the base for a fresh database and must include + // a node from the second checkpoint, which will be used for multipart restore. + // The pre-existing node should then not be deleted after aborting the second + // checkpoint. + + // Create the checkpoint to be used as the overriding restore. + ckMeta2, ckNodes2 := createCheckpoint(ctx.ctx, ctx.require, ctx.dir, testValues, 2) + fmt.Println(ctx.ckNodes) + var overlap bool + for node1 := range ctx.ckNodes { + fmt.Println("IM HERE", node1) + if _, ok := ckNodes2[node1]; ok { + overlap = true + break + } + } + ctx.require.Equal(true, overlap, "pointless test when no nodes would overlap") + + // Restore first checkpoint. The database is empty. + restoreCheckpoint(ctx, ctx.ckMeta, ctx.ckNodes) + err := ctx.rocksdb.Finalize([]node.Root{ctx.ckMeta.Root}) + ctx.require.NoError(err, "Finalize()") + verifyNodes(ctx.require, ctx.rocksdb, 2, ctx.ckNodes) + + // Restore the second checkpoint. One of the nodes from it already exists. After aborting, + // exactly the nodes from the first checkpoint should remain. + restorer := restoreCheckpoint(ctx, ckMeta2, ckNodes2) + err = restorer.AbortRestore(ctx.ctx) + ctx.require.NoError(err, "AbortRestore()") + err = ctx.rocksdb.AbortMultipartInsert() + ctx.require.NoError(err, "AbortMultipartInsert()") + verifyNodes(ctx.require, ctx.rocksdb, 2, ctx.ckNodes) +} diff --git a/go/storage/mkvs/db/rocksdb/timestamp.go b/go/storage/mkvs/db/rocksdb/timestamp.go new file mode 100644 index 00000000000..dbba60ad1c7 --- /dev/null +++ b/go/storage/mkvs/db/rocksdb/timestamp.go @@ -0,0 +1,78 @@ +package rocksdb + +import ( + "bytes" + "encoding/binary" + + "github.com/linxGnu/grocksdb" +) + +// Versions (u64) are used as timestamps. 
+const timestampSize = 8 + +// createTimestampComparator is identical to the RocksDB builtin u64 timestamp comparator. +// https://github.com/facebook/rocksdb/blob/526f36b48381dd640a0426bd748dbc0bb5797c75/util/comparator.cc#L234-L307. +func createTimestampComparator() *grocksdb.Comparator { + return grocksdb.NewComparatorWithTimestamp( + // Use the builtin "leveldb.BytewiseComparator.u64ts" as name, + // so that the builtin tools `ldb`/`sst_dump` can work with the db. + "leveldb.BytewiseComparator.u64ts", + timestampSize, + compareTimestampKeys, + compareTimestamp, + compareWithoutTimestamp, + ) +} + +// gorocksdb.Comparing. +func compareTimestampKeys(a, b []byte) int { + // First compare keys without timestamps. + if ret := compareWithoutTimestamp(a, true, b, true); ret != 0 { + return ret + } + + // In case the key is the same, compare the timestamp (larger first). + return -compareTimestamp(a[len(a)-timestampSize:], b[len(b)-timestampSize:]) +} + +// gorocksdb.Comparing. +func compareTimestamp(a, b []byte) int { + ts1 := binary.LittleEndian.Uint64(a) + ts2 := binary.LittleEndian.Uint64(b) + + switch { + case ts1 < ts2: + return -1 + case ts1 > ts2: + return 1 + default: + return 0 + } +} + +// gorocksdb.ComparingWithoutTimestamp. +func compareWithoutTimestamp(a []byte, aHasTs bool, b []byte, bHasTs bool) int { + if aHasTs { + a = a[:len(a)-timestampSize] + } + if bHasTs { + b = b[:len(b)-timestampSize] + } + return bytes.Compare(a, b) +} + +func timestampFromVersion(version uint64) [timestampSize]byte { + var ts [timestampSize]byte + binary.LittleEndian.PutUint64(ts[:], version) + return ts +} + +// timestampReadOptions returns ReadOptions used in the RocksDB column family read. +func timestampReadOptions(version uint64) *grocksdb.ReadOptions { + ts := timestampFromVersion(version) + + readOpts := grocksdb.NewDefaultReadOptions() + readOpts.SetTimestamp(ts[:]) + + return readOpts +} diff --git a/go/storage/mkvs/db/badger/hash.go b/go/storage/mkvs/node/hash.go similarity index 58% rename from go/storage/mkvs/db/badger/hash.go rename to go/storage/mkvs/node/hash.go index b76b18bdc82..94debcf3be0 100644 --- a/go/storage/mkvs/db/badger/hash.go +++ b/go/storage/mkvs/node/hash.go @@ -1,4 +1,4 @@ -package badger +package node import ( "crypto/subtle" @@ -8,27 +8,26 @@ import ( "fmt" "github.com/oasisprotocol/oasis-core/go/common/crypto/hash" - "github.com/oasisprotocol/oasis-core/go/storage/mkvs/node" ) var ( - _ encoding.BinaryMarshaler = (*typedHash)(nil) - _ encoding.BinaryUnmarshaler = (*typedHash)(nil) + _ encoding.BinaryMarshaler = (*TypedHash)(nil) + _ encoding.BinaryUnmarshaler = (*TypedHash)(nil) ) const typedHashSize = hash.Size + 1 -// typedHash is a node hash prefixed with its root type. -type typedHash [typedHashSize]byte +// TypedHash is a node hash prefixed with its root type. +type TypedHash [typedHashSize]byte // MarshalBinary encodes a typed hash into binary form. -func (h *typedHash) MarshalBinary() (data []byte, err error) { +func (h *TypedHash) MarshalBinary() (data []byte, err error) { data = append([]byte{}, h[:]...) return } // UnmarshalBinary decodes a binary marshaled hash. -func (h *typedHash) UnmarshalBinary(data []byte) error { +func (h *TypedHash) UnmarshalBinary(data []byte) error { if len(data) != typedHashSize { fmt.Printf("\nunexpected typedhash size: got %v, expected %v\n", len(data), typedHashSize) return hash.ErrMalformed @@ -40,12 +39,12 @@ func (h *typedHash) UnmarshalBinary(data []byte) error { } // MarshalText encodes a Hash into text form. 
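
// [Editor's note, not part of this patch.] After this rename, TypedHash is
// part of the public node API: one root-type byte followed by the 32-byte
// hash (typedHashSize = hash.Size + 1), with MarshalBinary/UnmarshalBinary
// round-tripping all 33 bytes verbatim and String rendering as
// "<root type>:<hex hash>".
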
-func (h typedHash) MarshalText() (data []byte, err error) { +func (h TypedHash) MarshalText() (data []byte, err error) { return []byte(base64.StdEncoding.EncodeToString(h[:])), nil } // UnmarshalText decodes a text marshaled Hash. -func (h *typedHash) UnmarshalText(text []byte) error { +func (h *TypedHash) UnmarshalText(text []byte) error { b, err := base64.StdEncoding.DecodeString(string(text)) if err != nil { return err @@ -55,7 +54,7 @@ func (h *typedHash) UnmarshalText(text []byte) error { } // UnmarshalHex deserializes a hexadecimal text string into the given type. -func (h *typedHash) UnmarshalHex(text string) error { +func (h *TypedHash) UnmarshalHex(text string) error { b, err := hex.DecodeString(text) if err != nil { return err @@ -65,7 +64,7 @@ func (h *typedHash) UnmarshalHex(text string) error { } // Equal compares vs another hash for equality. -func (h *typedHash) Equal(cmp *typedHash) bool { +func (h *TypedHash) Equal(cmp *TypedHash) bool { if cmp == nil { return false } @@ -73,36 +72,36 @@ func (h *typedHash) Equal(cmp *typedHash) bool { } // String returns the string representation of a typed hash. -func (h typedHash) String() string { - return fmt.Sprintf("%v:%s", node.RootType(h[0]), hex.EncodeToString(h[1:])) +func (h TypedHash) String() string { + return fmt.Sprintf("%v:%s", RootType(h[0]), hex.EncodeToString(h[1:])) } // FromParts returns the typed hash composed of the given type and hash. -func (h *typedHash) FromParts(typ node.RootType, hash hash.Hash) { +func (h *TypedHash) FromParts(typ RootType, hash hash.Hash) { h[0] = byte(typ) copy(h[1:], hash[:]) } // Type returns the storage type of the root corresponding to this typed hash. -func (h *typedHash) Type() node.RootType { - return node.RootType(h[0]) +func (h *TypedHash) Type() RootType { + return RootType(h[0]) } // Hash returns the hash portion of the typed hash. -func (h *typedHash) Hash() (rh hash.Hash) { +func (h *TypedHash) Hash() (rh hash.Hash) { copy(rh[:], h[1:]) return } -// typedHashFromParts creates a new typed hash with the parts given. -func typedHashFromParts(typ node.RootType, hash hash.Hash) (h typedHash) { +// TypedHashFromParts creates a new typed hash with the parts given. +func TypedHashFromParts(typ RootType, hash hash.Hash) (h TypedHash) { h[0] = byte(typ) copy(h[1:], hash[:]) return } -// typedHashFromRoot creates a new typed hash corresponding to the given storage root. -func typedHashFromRoot(root node.Root) (h typedHash) { +// TypedHashFromRoot creates a new typed hash corresponding to the given storage root. 
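
// [Editor's note: illustrative sketch, not part of this patch.]
// Typical call sites in the new rocksdb backend once these constructors are
// exported:
//
//	rootHash := node.TypedHashFromRoot(root)                // keys for roots
//	th := node.TypedHashFromParts(node.RootTypeInvalid, h)  // plain node entries
//	key := rootNodeKeyFmt.Encode(&rootHash)
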
+func TypedHashFromRoot(root Root) (h TypedHash) { h[0] = byte(root.Type) copy(h[1:], root.Hash[:]) return diff --git a/go/storage/mkvs/tree_test.go b/go/storage/mkvs/tree_test.go index 05fed5a0078..ed18f13bc8d 100644 --- a/go/storage/mkvs/tree_test.go +++ b/go/storage/mkvs/tree_test.go @@ -17,6 +17,7 @@ import ( "github.com/oasisprotocol/oasis-core/go/common/crypto/hash" db "github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/api" badgerDb "github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/badger" + "github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/rocksdb" "github.com/oasisprotocol/oasis-core/go/storage/mkvs/node" "github.com/oasisprotocol/oasis-core/go/storage/mkvs/syncer" mkvsTests "github.com/oasisprotocol/oasis-core/go/storage/mkvs/tests" @@ -2332,6 +2333,30 @@ func TestBadgerBackend(t *testing.T) { }, nil) } +func TestRocksDBBackend(t *testing.T) { + testBackend(t, func(t *testing.T) (NodeDBFactory, func()) { + // Create a new random temporary directory under /tmp. + dir, err := os.MkdirTemp("", "mkvs.test.rocksdb") + require.NoError(t, err, "TempDir") + + // Create a RocksDB-backed Node DB factory. + factory := func(ns common.Namespace) (db.NodeDB, error) { + return rocksdb.New(&db.Config{ + DB: dir, + NoFsync: true, + Namespace: ns, + MaxCacheSize: 16 * 1024 * 1024, + }) + } + + cleanup := func() { + os.RemoveAll(dir) + } + + return factory, cleanup + }, nil) +} + func BenchmarkInsertCommitBatch1(b *testing.B) { benchmarkInsertBatch(b, 1, true) } From 498ba919cc13290a58f16da1fe384a1cee65b3af Mon Sep 17 00:00:00 2001 From: ptrus Date: Thu, 2 Nov 2023 08:57:02 +0100 Subject: [PATCH 02/28] simplify iterator --- go/storage/mkvs/db/badger/badger.go | 3 - go/storage/mkvs/db/rocksdb/batch.go | 1 - go/storage/mkvs/db/rocksdb/iterator.go | 75 ++++------------- go/storage/mkvs/db/rocksdb/rocksdb.go | 93 +++++++--------------- go/storage/mkvs/db/rocksdb/rocksdb_test.go | 17 +--- go/storage/mkvs/db/rocksdb/timestamp.go | 9 +++ 6 files changed, 56 insertions(+), 142 deletions(-) diff --git a/go/storage/mkvs/db/badger/badger.go b/go/storage/mkvs/db/badger/badger.go index 5ab2309bfc1..3823cfbc7c9 100644 --- a/go/storage/mkvs/db/badger/badger.go +++ b/go/storage/mkvs/db/badger/badger.go @@ -650,15 +650,12 @@ func (d *badgerNodeDB) Finalize(roots []node.Root) error { // nolint: gocyclo // Remove write logs for the non-finalized root. 
if !d.discardWriteLogs { - fmt.Println("DISCARDING HERE badger") if err = func() error { rootWriteLogsPrefix := writeLogKeyFmt.Encode(version, &rootHash) - fmt.Println("Prefix badger: ", version, rootWriteLogsPrefix) wit := tx.NewIterator(badger.IteratorOptions{Prefix: rootWriteLogsPrefix}) defer wit.Close() for wit.Rewind(); wit.Valid(); wit.Next() { - fmt.Println("DELETING HERE badger", version, wit.Item().Key()) if err = versionBatch.Delete(wit.Item().KeyCopy(nil)); err != nil { return err } diff --git a/go/storage/mkvs/db/rocksdb/batch.go b/go/storage/mkvs/db/rocksdb/batch.go index b5e5c640169..453a40ef0a1 100644 --- a/go/storage/mkvs/db/rocksdb/batch.go +++ b/go/storage/mkvs/db/rocksdb/batch.go @@ -64,7 +64,6 @@ func (ba *rocksdbBatch) Commit(root node.Root) error { ts := timestampFromVersion(root.Version) ba.bat.PutCFWithTS(ba.db.cfNode, rootNodeKeyFmt.Encode(&rootHash), ts[:], []byte{}) if ba.multipartNodes != nil { - fmt.Println("putting there") ba.multipartNodes.Put(multipartRestoreNodeLogKeyFmt.Encode(&rootHash), []byte{}) } diff --git a/go/storage/mkvs/db/rocksdb/iterator.go b/go/storage/mkvs/db/rocksdb/iterator.go index 8083aef602b..76e675bee42 100644 --- a/go/storage/mkvs/db/rocksdb/iterator.go +++ b/go/storage/mkvs/db/rocksdb/iterator.go @@ -8,47 +8,21 @@ import ( ) type iterator struct { - source *grocksdb.Iterator - start, end []byte - reverse bool - invalid bool + source *grocksdb.Iterator + prefix []byte + invalid bool } -// TODO: add support for prefix, on valid, check if prefix matches. -func newIterator(source *grocksdb.Iterator, start, end []byte, reverse bool) *iterator { - switch reverse { - case false: - if start == nil { - source.SeekToFirst() - } else { - source.Seek(start) - } - case true: - if end == nil { - source.SeekToLast() - } else { - source.Seek(end) - - if source.Valid() { - // We are either at the matching key, or the next key. - eoaKey := readOnlySlice(source.Key()) - if bytes.Compare(end, eoaKey) <= 0 { // end == aoaKey, or end < eaoKey - // End is exclusive, so move to the previous key. - source.Prev() - } - } else { - // Past the end of the db, move to the last key. - source.SeekToLast() - } - } - +func prefixIterator(source *grocksdb.Iterator, prefix []byte) *iterator { + if prefix == nil { + source.SeekToFirst() + } else { + source.Seek(prefix) } return &iterator{ source: source, - start: start, - end: end, - reverse: reverse, + prefix: prefix, invalid: !source.Valid(), } } @@ -73,38 +47,27 @@ func copyAndFreeSlice(s *grocksdb.Slice) []byte { } func (itr *iterator) Valid() bool { - // once invalid, forever invalid + // Once invalid, always invalid. if itr.invalid { return false } - // if source has error, consider it invalid + // Check for errors. if err := itr.source.Err(); err != nil { itr.invalid = true return false } - // if source is invalid, consider it invalid + // If iterator is not valid, we are done. if !itr.source.Valid() { itr.invalid = true return false } - // if key is at the end or past it, consider it invalid - start := itr.start - end := itr.end - key := readOnlySlice(itr.source.Key()) - - if itr.reverse { - if start != nil && bytes.Compare(key, start) < 0 { - itr.invalid = true - return false - } - } else { - if end != nil && bytes.Compare(end, key) <= 0 { - itr.invalid = true - return false - } + // If key does not match prefix, we are done. 
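
// [Editor's note, not part of this patch.] The simplification relies on two
// properties: Seek(prefix) positions the iterator at the first key greater
// than or equal to the prefix, and all keys sharing a prefix sort
// contiguously under the bytewise comparator, so scanning until the
// HasPrefix check below fails visits exactly the prefixed range. Explicit
// end bounds and reverse iteration, unused by the node DB, can therefore be
// dropped.
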
+ if !bytes.HasPrefix(readOnlySlice(itr.source.Key()), itr.prefix) { + itr.invalid = true + return false } return true @@ -125,11 +88,7 @@ func (itr iterator) Next() bool { return false } - if itr.reverse { - itr.source.Prev() - } else { - itr.source.Next() - } + itr.source.Next() return itr.Valid() } diff --git a/go/storage/mkvs/db/rocksdb/rocksdb.go b/go/storage/mkvs/db/rocksdb/rocksdb.go index 8f105bdd5f2..7b83ccc135d 100644 --- a/go/storage/mkvs/db/rocksdb/rocksdb.go +++ b/go/storage/mkvs/db/rocksdb/rocksdb.go @@ -1,9 +1,8 @@ -// Package badger provides a RocksDB-backed node database. +// Package rocksdb provides a RocksDB-backed node database. package rocksdb import ( "context" - "encoding/binary" "fmt" "runtime" "sync" @@ -32,19 +31,19 @@ var ( // rootsMetadataKeyFmt is the key format for roots metadata. The key format is (version). // // Value is CBOR-serialized rootsMetadata. - rootsMetadataKeyFmt = keyformat.New(0x02, uint64(0)) + rootsMetadataKeyFmt = keyformat.New(0x00, uint64(0)) // rootUpdatedNodesKeyFmt is the key format for the pending updated nodes for the // given root that need to be removed only in case the given root is not among // the finalized roots. They key format is (version, root). // // Value is CBOR-serialized []updatedNode. - rootUpdatedNodesKeyFmt = keyformat.New(0x03, uint64(0), &node.TypedHash{}) + rootUpdatedNodesKeyFmt = keyformat.New(0x01, uint64(0), &node.TypedHash{}) // metadataKeyFmt is the key format for metadata. // // Value is CBOR-serialized metadata. - metadataKeyFmt = keyformat.New(0x04) + metadataKeyFmt = keyformat.New(0x02) // multipartRestoreNodeLogKeyFmt is the key format for the nodes inserted during a chunk restore. // Once a set of chunks is fully restored, these entries should be removed. If chunk restoration @@ -52,7 +51,7 @@ var ( // with these entries. // // Value is empty. - multipartRestoreNodeLogKeyFmt = keyformat.New(0x05, &node.TypedHash{}) + multipartRestoreNodeLogKeyFmt = keyformat.New(0x03, &node.TypedHash{}) ) // Node CF keys (timestamped). @@ -62,7 +61,6 @@ var ( // Value is serialized node. nodeKeyFmt = keyformat.New(0x00, &hash.Hash{}) - // TODO: separate CF? // writeLogKeyFmt is the key format for write logs (version, new root, // old root). // @@ -72,7 +70,7 @@ var ( // rootNodeKeyFmt is the key format for root nodes (node hash). // // Value is empty. - rootNodeKeyFmt = keyformat.New(0x06, &node.TypedHash{}) + rootNodeKeyFmt = keyformat.New(0x02, &node.TypedHash{}) ) var ( @@ -96,9 +94,10 @@ func New(cfg *api.Config) (api.NodeDB, error) { readOnly: cfg.ReadOnly, } - // XXX: Most of these options were taken from cosmos SDK. + // XXX: Most of these were taken from Cosmos-SDK RocksDB impl. // Experiment/modify if needed. Most of these can be adjusted // on a live database. + // Also see: https://github.com/facebook/rocksdb/wiki/RocksDB-Tuning-Guide // Create options for the metadata column family. optsMeta := grocksdb.NewDefaultOptions() @@ -354,7 +353,6 @@ func (d *rocksdbNodeDB) GetNode(root node.Root, ptr *node.Pointer) (node.Node, e } defer s.Free() if !s.Exists() { - fmt.Println("fail here", root.Version) return nil, api.ErrNodeNotFound } @@ -419,10 +417,8 @@ func (d *rocksdbNodeDB) GetWriteLog(ctx context.Context, startRoot, endRoot node wl, err := func() (writelog.Iterator, error) { // Iterate over all write logs that result in the current item. - start := writeLogKeyFmt.Encode(endRoot.Version, &curItem.endRootHash) - - // TODO: maybe support prefix iterator? (manually configure start & end). 
- it := newIterator(d.db.NewIteratorCF(timestampReadOptions(endRoot.Version), d.cfNode), start, nil, false) + prefix := writeLogKeyFmt.Encode(endRoot.Version, &curItem.endRootHash) + it := prefixIterator(d.db.NewIteratorCF(timestampReadOptions(endRoot.Version), d.cfNode), prefix) defer it.Close() for ; it.Valid(); it.Next() { @@ -440,10 +436,6 @@ func (d *rocksdbNodeDB) GetWriteLog(ctx context.Context, startRoot, endRoot node if !writeLogKeyFmt.Decode(key, &decVersion, &decEndRootHash, &decStartRootHash) { return nil, nil } - // TODO: check other such places. - if decVersion != endRoot.Version || !decEndRootHash.Equal(&curItem.endRootHash) { - return nil, nil - } nextItem := wlItem{ depth: curItem.depth + 1, @@ -568,7 +560,7 @@ func (d *rocksdbNodeDB) HasRoot(root node.Root) bool { func (d *rocksdbNodeDB) Finalize(roots []node.Root) error { if len(roots) == 0 { - return fmt.Errorf("mkvs/badger: need at least one root to finalize") + return fmt.Errorf("mkvs/rocksdb: need at least one root to finalize") } version := roots[0].Version @@ -594,7 +586,7 @@ func (d *rocksdbNodeDB) Finalize(roots []node.Root) error { finalizedRoots := make(map[node.TypedHash]bool) for _, root := range roots { if root.Version != version { - return fmt.Errorf("mkvs/badger: roots to finalize don't have matching versions") + return fmt.Errorf("mkvs/rocksdb: roots to finalize don't have matching versions") } finalizedRoots[node.TypedHashFromRoot(root)] = true } @@ -651,9 +643,9 @@ func (d *rocksdbNodeDB) Finalize(roots []node.Root) error { var updatedNodes []updatedNode if err := cbor.UnmarshalTrusted(item.Data(), &updatedNodes); err != nil { - panic(fmt.Errorf("mkvs/badger: corrupted root updated nodes index: %w", err)) + panic(fmt.Errorf("mkvs/rocksdb: corrupted root updated nodes index: %w", err)) } - item.Free() // TODO: wrapper. + item.Free() if finalizedRoots[rootHash] { // Make sure not to remove any nodes shared with finalized roots. @@ -665,7 +657,7 @@ func (d *rocksdbNodeDB) Finalize(roots []node.Root) error { } } } else { - // Remove any non-finalized roots. It is safe to remove these nodes as Badger's version + // Remove any non-finalized roots. It is safe to remove these nodes as RocksDB's version // control will make sure they are not removed if they are resurrected in any later // version as long as we make sure that these nodes are not shared with any finalized // roots added in the same version. @@ -680,31 +672,14 @@ func (d *rocksdbNodeDB) Finalize(roots []node.Root) error { // Remove write logs for the non-finalized root. 
if !d.discardWriteLogs { - fmt.Println("DISCARDING HERE") if err = func() error { rootWriteLogsPrefix := writeLogKeyFmt.Encode(version, &rootHash) - fmt.Println("Prefix: ", version, rootWriteLogsPrefix) - wit := newIterator(d.db.NewIteratorCF(timestampReadOptions(version), d.cfNode), rootWriteLogsPrefix, nil, false) + wit := prefixIterator(d.db.NewIteratorCF(timestampReadOptions(version), d.cfNode), rootWriteLogsPrefix) defer wit.Close() - fmt.Println("AA", wit.Valid()) // cf := d.getColumnFamilyForType(rootHash.Type()) for ; wit.Valid(); wit.Next() { - fmt.Println("COME HERE") - key := wit.Key() - - var decVersion uint64 - var decRootHash node.TypedHash - var decRootHash2 node.TypedHash - if !writeLogKeyFmt.Decode(key, &decVersion, &decRootHash, &decRootHash2) { - return nil - } - if decVersion != version || !decRootHash.Equal(&rootHash) { - return nil - } - - fmt.Println("DELETING HERE", ts, d.cfNode) - batch.DeleteCFWithTS(d.cfNode, key, ts[:]) + batch.DeleteCFWithTS(d.cfNode, wit.Key(), ts[:]) } return nil }(); err != nil { @@ -739,7 +714,7 @@ func (d *rocksdbNodeDB) Finalize(roots []node.Root) error { // Commit batch. if err := d.db.Write(defaultWriteOptions, batch); err != nil { - return fmt.Errorf("mkvs/badger: failed to commit finalized roots: %w", err) + return fmt.Errorf("mkvs/rocksdb: failed to commit finalized roots: %w", err) } // Clean multipart metadata if there is any. @@ -803,11 +778,12 @@ func (d *rocksdbNodeDB) Prune(ctx context.Context, version uint64) error { return false } - // TODO: Extract. - itemTS := binary.LittleEndian.Uint64(ts.Data()) - defer ts.Free() - - if itemTS == version { + itemTs, err := versionFromTimestamp(ts) + if err != nil { + // Shouldn't happen unless corrupted db. + panic(fmt.Errorf("mkvs/rocksdb: missing/corrupted timestamp for node: %s", h)) + } + if itemTs == version { batch.DeleteCFWithTS(d.cfNode, nodeKeyFmt.Encode(&h), ts.Data()) } return true @@ -824,24 +800,11 @@ func (d *rocksdbNodeDB) Prune(ctx context.Context, version uint64) error { // Prune all write logs in version. if !d.discardWriteLogs { - prefix := writeLogKeyFmt.Encode(version) - wit := newIterator(d.db.NewIteratorCF(timestampReadOptions(version), d.cfNode), prefix, nil, false) + wit := prefixIterator(d.db.NewIteratorCF(timestampReadOptions(version), d.cfNode), writeLogKeyFmt.Encode(version)) defer wit.Close() for ; wit.Valid(); wit.Next() { - key := wit.Key() - - var decVersion uint64 - var decRootHash node.TypedHash - var decRootHash2 node.TypedHash - if !writeLogKeyFmt.Decode(key, &decVersion, &decRootHash, &decRootHash2) { - break - } - if decVersion != version { - break - } - - batch.DeleteCFWithTS(d.cfNode, key, ts[:]) // HM? seems wrong. 
+ batch.DeleteCFWithTS(d.cfNode, wit.Key(), ts[:]) } } @@ -911,7 +874,7 @@ func (d *rocksdbNodeDB) cleanMultipartLocked(removeNodes bool) error { return nil } - it := newIterator(d.db.NewIterator(defaultReadOptions), multipartRestoreNodeLogKeyFmt.Encode(), nil, false) + it := prefixIterator(d.db.NewIterator(defaultReadOptions), multipartRestoreNodeLogKeyFmt.Encode()) defer it.Close() batch := grocksdb.NewWriteBatch() @@ -933,11 +896,9 @@ func (d *rocksdbNodeDB) cleanMultipartLocked(removeNodes bool) error { } switch hash.Type() { case node.RootTypeInvalid: - fmt.Println("REMOVING NODE at TS", ts) h := hash.Hash() batch.DeleteCFWithTS(d.cfNode, nodeKeyFmt.Encode(&h), ts[:]) default: - fmt.Println("REMOVING ROOT NODE at TS", ts, hash.Hash()) // cf := d.getColumnFamilyForType(hash.Type()) batch.DeleteCFWithTS(d.cfNode, rootNodeKeyFmt.Encode(&hash), ts[:]) } diff --git a/go/storage/mkvs/db/rocksdb/rocksdb_test.go b/go/storage/mkvs/db/rocksdb/rocksdb_test.go index d9f2585d74f..0550b352674 100644 --- a/go/storage/mkvs/db/rocksdb/rocksdb_test.go +++ b/go/storage/mkvs/db/rocksdb/rocksdb_test.go @@ -107,19 +107,13 @@ func createCheckpoint(ctx context.Context, require *require.Assertions, dir stri require.NoError(err, "CreateCheckpoint()") nodeKeys := keySet{} - it := newIterator(rocksdb.db.NewIteratorCF(timestampReadOptions(2), rocksdb.cfNode), nil, nil, false) + it := prefixIterator(rocksdb.db.NewIteratorCF(timestampReadOptions(2), rocksdb.cfNode), nil) defer it.Close() for ; it.Valid(); it.Next() { - // TODO: maybe gotta strip version. - if len(values) == 1 { - fmt.Println("AAAAa", it.Key()) - } - if bytes.HasPrefix(it.Key(), nodePrefix) { nodeKeys[string(it.Key())] = struct{}{} } } - fmt.Println("node keys", len(values), version, len(nodeKeys)) return ckMeta, nodeKeys } @@ -130,7 +124,7 @@ func verifyNodes(require *require.Assertions, rocksdb *rocksdbNodeDB, version ui notVisited[k] = struct{}{} } - it := newIterator(rocksdb.db.NewIteratorCF(timestampReadOptions(version), rocksdb.cfNode), nil, nil, false) + it := prefixIterator(rocksdb.db.NewIteratorCF(timestampReadOptions(version), rocksdb.cfNode), nil) defer it.Close() for ; it.Valid(); it.Next() { key := it.Key() @@ -138,7 +132,6 @@ func verifyNodes(require *require.Assertions, rocksdb *rocksdbNodeDB, version ui continue } _, ok := keySet[string(key)] - fmt.Println(version, key) require.Equal(true, ok, "unexpected node in db") delete(notVisited, string(key)) } @@ -146,7 +139,7 @@ func verifyNodes(require *require.Assertions, rocksdb *rocksdbNodeDB, version ui } func checkNoLogKeys(require *require.Assertions, rocksdb *rocksdbNodeDB) { - it := newIterator(rocksdb.db.NewIterator(defaultReadOptions), nil, nil, false) + it := prefixIterator(rocksdb.db.NewIterator(defaultReadOptions), nil) defer it.Close() for ; it.Valid(); it.Next() { require.False(bytes.HasPrefix(it.Key(), logPrefix), "checkLogKeys()/iteration") @@ -199,9 +192,7 @@ func TestMultipartRestore(t *testing.T) { require.NoError(err, "TempDir()") defer os.RemoveAll(dir) - fmt.Println("Creating checkpoint with", initialValues) ckMeta, ckNodes := createCheckpoint(ctx, require, dir, initialValues, 1) - fmt.Println("Got:", ckNodes) dbCfg := *dbCfg dbCfg.DB = dir @@ -271,10 +262,8 @@ func testExistingNodes(ctx *test) { // Create the checkpoint to be used as the overriding restore. 
ckMeta2, ckNodes2 := createCheckpoint(ctx.ctx, ctx.require, ctx.dir, testValues, 2) - fmt.Println(ctx.ckNodes) var overlap bool for node1 := range ctx.ckNodes { - fmt.Println("IM HERE", node1) if _, ok := ckNodes2[node1]; ok { overlap = true break diff --git a/go/storage/mkvs/db/rocksdb/timestamp.go b/go/storage/mkvs/db/rocksdb/timestamp.go index dbba60ad1c7..bbaf3414aad 100644 --- a/go/storage/mkvs/db/rocksdb/timestamp.go +++ b/go/storage/mkvs/db/rocksdb/timestamp.go @@ -3,6 +3,7 @@ package rocksdb import ( "bytes" "encoding/binary" + "fmt" "github.com/linxGnu/grocksdb" ) @@ -76,3 +77,11 @@ func timestampReadOptions(version uint64) *grocksdb.ReadOptions { return readOpts } + +func versionFromTimestamp(ts *grocksdb.Slice) (uint64, error) { + if !ts.Exists() { + return 0, fmt.Errorf("timestamp empty") + } + defer ts.Free() + return binary.LittleEndian.Uint64(ts.Data()), nil +} From 44c2326dce675ba312d86bc2201cb7d14d136600 Mon Sep 17 00:00:00 2001 From: ptrus Date: Thu, 2 Nov 2023 11:17:45 +0100 Subject: [PATCH 03/28] mkvs/rocksdb: split io and state tree --- go/.golangci.yml | 1 + go/common/keyformat/key_format.go | 2 +- go/nexus-genesis.json | 300 --------------------- go/storage/mkvs/db/badger/migrate_test.go | 4 +- go/storage/mkvs/db/db_test.go | 2 +- go/storage/mkvs/db/rocksdb/batch.go | 29 +- go/storage/mkvs/db/rocksdb/rocksdb.go | 209 +++++++------- go/storage/mkvs/db/rocksdb/rocksdb_test.go | 42 ++- go/storage/mkvs/db/rocksdb/timestamp.go | 20 +- 9 files changed, 158 insertions(+), 451 deletions(-) delete mode 100644 go/nexus-genesis.json diff --git a/go/.golangci.yml b/go/.golangci.yml index 13c3ddaaf2e..a0ab6640bba 100644 --- a/go/.golangci.yml +++ b/go/.golangci.yml @@ -48,6 +48,7 @@ linters-settings: - github.com/stretchr - github.com/tidwall/btree - github.com/tyler-smith/go-bip39 + - github.com/linxGnu/grocksdb linters: disable-all: true diff --git a/go/common/keyformat/key_format.go b/go/common/keyformat/key_format.go index 8f3af2d9da1..82fa35a5c41 100644 --- a/go/common/keyformat/key_format.go +++ b/go/common/keyformat/key_format.go @@ -157,7 +157,7 @@ func (k *KeyFormat) Encode(values ...interface{}) []byte { panic(fmt.Sprintf("key format: failed to marshal element %d: %s", i, err)) } if len(data) != meta.size { - panic(fmt.Sprintf("key format: unexpected marshalled size %d for element %d (expected: %d)", len(data), i, meta.size)) + panic(fmt.Sprintf("key format: unexpected marshalled size %d for element %d", len(data), i)) } copy(buf[:], data) diff --git a/go/nexus-genesis.json b/go/nexus-genesis.json deleted file mode 100644 index 5815b30aab8..00000000000 --- a/go/nexus-genesis.json +++ /dev/null @@ -1,300 +0,0 @@ -{ - "height": 1, - "genesis_time": "2022-09-26T04:06:00.331342488Z", - "chain_id": "oasis-3", - "comment ^": "This is not the real mainnet. 
However, we match its chain_id so that we can test sql migrations (which use the chain_id as part of the fully qualified table names)", - "registry": { - "params": { - "debug_allow_unroutable_addresses": true, - "debug_allow_test_runtimes": true, - "gas_costs": { - "deregister_entity": 1000, - "register_entity": 1000, - "register_node": 1000, - "register_runtime": 1000, - "runtime_epoch_maintenance": 1000, - "unfreeze_node": 1000, - "update_keymanager": 1000 - }, - "max_node_expiration": 5, - "enable_runtime_governance_models": { - "entity": true, - "runtime": true - } - }, - "entities": [ - { - "untrusted_raw_value": "o2F2AmJpZFggJTUtHd4XYQjh//e6eYU7Pa/XMFG88WE+jixvceIfWrllbm9kZXOBWCAtC7hm0WDw4nQwLgzhAx5RHsizpe3gD8Jb48r/tM+IfQ==", - "signature": { - "public_key": "JTUtHd4XYQjh//e6eYU7Pa/XMFG88WE+jixvceIfWrk=", - "signature": "L9wqH/IjJ3AdgledgK/1qU86f5kKjWy/zpd3cS8YkOYZyQBi+z98wNANy6ACiW3kpAD5uI/qcTg/ez+nE7dJCw==" - } - }, - { - "untrusted_raw_value": "omF2AmJpZFgg+MJpnSTzc11dNI5emMa+asCJH5cxBiBCcpbYE4XBdso=", - "signature": { - "public_key": "+MJpnSTzc11dNI5emMa+asCJH5cxBiBCcpbYE4XBdso=", - "signature": "LXp4Fl89LDJyqHR92PWfIN5yod+eOwZBfresKfiEoMReuZugzbIFjMuBrC8ruazd/UAbfU6r6MPIM4H6YQOvCA==" - } - }, - { - "untrusted_raw_value": "omF2AmJpZFggHc0T6ypN7Ytv3t+n5LyJjCd93geUMTo82BR8iS1sRkY=", - "signature": { - "public_key": "Hc0T6ypN7Ytv3t+n5LyJjCd93geUMTo82BR8iS1sRkY=", - "signature": "IRxC/BtxNGVjN21fzNGu/rNm1FN0wTwdrffEUJzigPOuSaP6dxNtxxKYWm89AMIByc29g2qtv33UPImDwH0lDQ==" - } - }, - { - "untrusted_raw_value": "omF2AmJpZFggBTDHzkZYnKPQKUcnN4ieYpLBKRLMu/tmktwpbLduAj8=", - "signature": { - "public_key": "BTDHzkZYnKPQKUcnN4ieYpLBKRLMu/tmktwpbLduAj8=", - "signature": "VDTt/fqiGFjFYUvAr7ar2LMSzlb5FEQnjzpJYrgUlhdFKukJI3cyCu0RApy9d4LM+eiv3L8uj5nmHLItovX2Aw==" - } - }, - { - "untrusted_raw_value": "omF2AmJpZFggkNGfL2S9wpni8G2phHcbUYRmnwSsZFny90VFTMXvDKo=", - "signature": { - "public_key": "kNGfL2S9wpni8G2phHcbUYRmnwSsZFny90VFTMXvDKo=", - "signature": "Dpa4leeFoFLd2OK5AsxqVghSivB8p2dibzVERJgODzGZDcoFpTurTX2g+MuIEoIP4TjWWr432l4i9lW1nniJAg==" - } - }, - { - "untrusted_raw_value": "omF2AmJpZFggTqUyj5Q+9vZtqu10yw6Zw7HEX3Ywe0JQA9vHyzY47TU=", - "signature": { - "public_key": "TqUyj5Q+9vZtqu10yw6Zw7HEX3Ywe0JQA9vHyzY47TU=", - "signature": "UiuPWSjYdXih3iEGBTPyWTWoXb4cXfPQi4xnrlgooXpin876uZe4Uy3D5tLYrqDCGpDJ+r/8r0gTJ6aIwCuvBQ==" - } - } - ], - "nodes": [ - { - "untrusted_raw_value": "q2F2AmJpZFggLQu4ZtFg8OJ0MC4M4QMeUR7Is6Xt4A/CW+PK/7TPiH1jcDJwomJpZFgg3NaiXoRM24g/ICmKIG3/UO0OQxe+2irGUZ7rWh8J+TBpYWRkcmVzc2Vz9mN0bHOjZ3B1Yl9rZXlYIJ7u/tQnAGPpg60PtX1KvTS5WvJ7xPLQtiO4rd3x0jOjaWFkZHJlc3Nlc/ZsbmV4dF9wdWJfa2V5WCC/5Vy8kvE846VcbsK3Haujk9mJKeBnKBnn45riiVdgemN2cmahYmlkWCBKt2yZ3y0U29agfBjZ6cyrZE2gQIHBOS0zBrZ5wMbKLmVyb2xlcwhocnVudGltZXP2aWNvbnNlbnN1c6JiaWRYIChZHJhhMXcGCXa4Jw9tMG5wERuu5KB0GZOVx0iJhjekaWFkZHJlc3Nlc4GiYmlkWCDc1qJehEzbiD8gKYogbf9Q7Q5DF77aKsZRnutaHwn5MGdhZGRyZXNzo2JJUFAAAAAAAAAAAAAA//9/AAABZFBvcnQZTiFkWm9uZWBpZW50aXR5X2lkWCAlNS0d3hdhCOH/97p5hTs9r9cwUbzxYT6OLG9x4h9auWpleHBpcmF0aW9uAXBzb2Z0d2FyZV92ZXJzaW9uZzIyLjEuMTA=", - "signatures": [ - { - "public_key": "LQu4ZtFg8OJ0MC4M4QMeUR7Is6Xt4A/CW+PK/7TPiH0=", - "signature": "GYM39hlnmmJFP0CjQqxfQnJUC/QMqEC8HGHuJaZVIBxTWMM87qjdbic6q0qQTS3F/MPm33WYCnBd2WJx0rKdAw==" - }, - { - "public_key": "3NaiXoRM24g/ICmKIG3/UO0OQxe+2irGUZ7rWh8J+TA=", - "signature": "i052fsAWa/t6QscGq7FqGwB70Fq9/p/ed2nnAPFlcba1LMZ83lzSLTlvZ/M1z8Gz4AlEJqI0unPqOiUndIxeAQ==" - }, - { - "public_key": "KFkcmGExdwYJdrgnD20wbnARG67koHQZk5XHSImGN6Q=", - "signature": 
"4/UsmJNxa8qhTYInLd3cRkDqJhtXpM136DfoZJxcWTomHeMeGQG6xHuAwuFiWl8PtAsLnOYWDErUxJ0qI/PtCw==" - }, - { - "public_key": "Srdsmd8tFNvWoHwY2enMq2RNoECBwTktMwa2ecDGyi4=", - "signature": "wKaq0rD0FrC58ZsIC0L3WI9/pys6PpvMAUpVsRmmMMly9fLgBrEcO30uNwSxgPRwCrA0efVUX189nLAjVgzkBw==" - }, - { - "public_key": "nu7+1CcAY+mDrQ+1fUq9NLla8nvE8tC2I7it3fHSM6M=", - "signature": "vbjuhiDjBg6FbOZOIN6ac7X9enMVCU4kkGz/7oYPG6gAKAhq3bb3A+VdJvULVU+9CMODNV8DI5sKh0xR6eJHBA==" - } - ] - } - ] - }, - "roothash": { - "params": { - "gas_costs": { - "compute_commit": 1000, - "evidence": 1000, - "proposer_timeout": 1000, - "submit_msg": 1000 - }, - "max_runtime_messages": 128, - "max_in_runtime_messages": 128, - "max_evidence_age": 0 - } - }, - "staking": { - "params": { - "thresholds": { - "entity": "0", - "node-compute": "0", - "node-keymanager": "0", - "node-validator": "0", - "runtime-compute": "0", - "runtime-keymanager": "0", - "node-observer": "0" - }, - "debonding_interval": 1, - "commission_schedule_rules": {}, - "min_delegation": "0", - "min_transfer": "0", - "min_transact_balance": "0", - "fee_split_weight_propose": "0", - "fee_split_weight_vote": "1", - "fee_split_weight_next_propose": "0", - "reward_factor_epoch_signed": "0", - "reward_factor_block_proposed": "0" - }, - "token_symbol": "TEST", - "token_value_exponent": 6, - "total_supply": "5200000000000", - "common_pool": "0", - "last_block_fees": "0", - "governance_deposits": "0", - "ledger": { - "oasis1qqczr9vgvp9gysgv0jx3ywww4gccyhqq3g8aygw4": { - "general": { - "balance": "1000000000000" - }, - "escrow": { - "active": { - "balance": "0", - "total_shares": "0" - }, - "debonding": { - "balance": "0", - "total_shares": "0" - }, - "commission_schedule": {}, - "stake_accumulator": {} - } - }, - "oasis1qqncl383h8458mr9cytatygctzwsx02n4c5f8ed7": { - "general": { - "balance": "1000000000000" - }, - "escrow": { - "active": { - "balance": "0", - "total_shares": "0" - }, - "debonding": { - "balance": "0", - "total_shares": "0" - }, - "commission_schedule": {}, - "stake_accumulator": {} - } - }, - "oasis1qqw3ka3eeuy5qaytyhesxtj4fe5pp0xkdy954uwk": { - "general": { - "balance": "1000000000000" - }, - "escrow": { - "active": { - "balance": "0", - "total_shares": "0" - }, - "debonding": { - "balance": "0", - "total_shares": "0" - }, - "commission_schedule": {}, - "stake_accumulator": {} - } - }, - "oasis1qrz6kjp9lu6vc6snhlszq3p2nlx76qasaqr2auqk": { - "general": { - "balance": "1000000000000" - }, - "escrow": { - "active": { - "balance": "0", - "total_shares": "0" - }, - "debonding": { - "balance": "0", - "total_shares": "0" - }, - "commission_schedule": {}, - "stake_accumulator": {} - } - }, - "oasis1qznshq4ttrgh83d9wqvgmsuq3pfsndg3tus7lx98": { - "general": { - "balance": "1000000000000" - }, - "escrow": { - "active": { - "balance": "0", - "total_shares": "0" - }, - "debonding": { - "balance": "0", - "total_shares": "0" - }, - "commission_schedule": {}, - "stake_accumulator": {} - } - }, - "oasis1qzzd6khm3acqskpxlk9vd5044cmmcce78y5l6000": { - "general": { - "balance": "100000000000" - }, - "escrow": { - "active": { - "balance": "100000000000", - "total_shares": "1" - }, - "debonding": { - "balance": "0", - "total_shares": "0" - }, - "commission_schedule": {}, - "stake_accumulator": {} - } - } - }, - "delegations": { - "oasis1qzzd6khm3acqskpxlk9vd5044cmmcce78y5l6000": { - "oasis1qzzd6khm3acqskpxlk9vd5044cmmcce78y5l6000": { - "shares": "1" - } - } - } - }, - "keymanager": {}, - "scheduler": { - "params": { - "min_validators": 1, - "max_validators": 100, - 
"max_validators_per_entity": 1, - "reward_factor_epoch_election_any": "0" - } - }, - "beacon": { - "base": 0, - "params": { - "backend": "insecure", - "insecure_parameters": { - "interval": 30 - } - } - }, - "governance": { - "params": { - "gas_costs": { - "cast_vote": 1000, - "submit_proposal": 1000 - }, - "min_proposal_deposit": "100", - "voting_period": 100, - "stake_threshold": 90, - "upgrade_min_epoch_diff": 300, - "upgrade_cancel_min_epoch_diff": 300 - } - }, - "consensus": { - "backend": "tendermint", - "params": { - "timeout_commit": 1000000000, - "skip_timeout_commit": false, - "empty_block_interval": 0, - "max_tx_size": 32768, - "max_block_size": 22020096, - "max_block_gas": 0, - "max_evidence_size": 1048576, - "state_checkpoint_interval": 0, - "state_checkpoint_chunk_size": 8388608, - "gas_costs": { - "tx_byte": 0 - } - } - }, - "halt_epoch": 86400, - "extra_data": null -} diff --git a/go/storage/mkvs/db/badger/migrate_test.go b/go/storage/mkvs/db/badger/migrate_test.go index cd180d972df..140341e868a 100644 --- a/go/storage/mkvs/db/badger/migrate_test.go +++ b/go/storage/mkvs/db/badger/migrate_test.go @@ -452,7 +452,7 @@ func TestBadgerV5KeyVersioning(t *testing.T) { } var h hash.Hash - var th1, th2 typedHash + var th1, th2 node.TypedHash var v uint64 for it.Rewind(); it.Valid(); it.Next() { @@ -497,7 +497,7 @@ func prettyPrintDBV5(ndb api.NodeDB) { // nolint: deadcode, unused defer it.Close() var h hash.Hash - var th1, th2 typedHash + var th1, th2 node.TypedHash var v uint64 for it.Rewind(); it.Valid(); it.Next() { diff --git a/go/storage/mkvs/db/db_test.go b/go/storage/mkvs/db/db_test.go index d6d7c078341..d36a5cccaca 100644 --- a/go/storage/mkvs/db/db_test.go +++ b/go/storage/mkvs/db/db_test.go @@ -268,7 +268,7 @@ func testVersionChecks(t *testing.T, new NodeDBFactory) { err = ndb.StartMultipartInsert(44) require.Error(err, "StartMultipartInsert(44)") - root := node.Root{} + root := node.Root{Type: node.RootTypeState} _, err = ndb.NewBatch(root, 0, false) // Normal chunks not allowed during multipart. require.Error(err, "NewBatch(.., 0, false)") _, err = ndb.NewBatch(root, 13, true) diff --git a/go/storage/mkvs/db/rocksdb/batch.go b/go/storage/mkvs/db/rocksdb/batch.go index 453a40ef0a1..21485b8bc36 100644 --- a/go/storage/mkvs/db/rocksdb/batch.go +++ b/go/storage/mkvs/db/rocksdb/batch.go @@ -24,7 +24,8 @@ type rocksdbBatch struct { oldRoot node.Root chunk bool - version uint64 + version uint64 + rootType node.RootType writeLog writelog.WriteLog annotations writelog.Annotations @@ -58,13 +59,12 @@ func (ba *rocksdbBatch) Commit(root node.Root) error { return err } - // cf := ba.db.getColumnFamilyForRoot(root) - + cf := ba.db.getColumnFamilyForRoot(root) rootHash := node.TypedHashFromRoot(root) ts := timestampFromVersion(root.Version) - ba.bat.PutCFWithTS(ba.db.cfNode, rootNodeKeyFmt.Encode(&rootHash), ts[:], []byte{}) + ba.bat.PutCFWithTS(cf, rootNodeKeyFmt.Encode(&rootHash), ts[:], []byte{}) if ba.multipartNodes != nil { - ba.multipartNodes.Put(multipartRestoreNodeLogKeyFmt.Encode(&rootHash), []byte{}) + ba.multipartNodes.Put(multipartRestoreRootLogKeyFmt.Encode(&rootHash), []byte{}) } if rootsMeta.Roots[rootHash] != nil { @@ -94,17 +94,13 @@ func (ba *rocksdbBatch) Commit(root node.Root) error { return api.ErrPreviousVersionMismatch } - // TODO: LongKeys. - // Old code re-loaded loadRootsMetadata here (which was saved in line 84). However i think this is not needed. 
-	// More-over we lose the updates here, since the batch was not yet submitted, this differs with badger transaction
-	// semantics. Maybe we should use transactions here, idk.
 	var oldRootsMeta *rootsMetadata
 	oldRootsMeta, err = loadRootsMetadata(ba.db.db, ba.oldRoot.Version)
 	if err != nil {
 		return err
 	}
-	// Check if overridden in the current WriteBatch.
-	// TODO: this is probably not needed, just pick rootsMeta here?
+	// Check if oldRootsMeta was updated in this batch.
+	// TODO: could this be avoided?
 	wbIter := ba.bat.NewIterator()
 	for {
 		if !wbIter.Next() {
@@ -137,7 +133,7 @@ func (ba *rocksdbBatch) Commit(root node.Root) error {
 			log := api.MakeHashedDBWriteLog(ba.writeLog, ba.annotations)
 			bytes := cbor.Marshal(log)
 			key := writeLogKeyFmt.Encode(root.Version, &rootHash, &oldRootHash)
-			ba.bat.PutCFWithTS(ba.db.cfNode, key, ts[:], bytes)
+			ba.bat.PutCFWithTS(cf, key, ts[:], bytes)
 		}
 	}
 
@@ -159,7 +155,7 @@ func (ba *rocksdbBatch) Commit(root node.Root) error {
 }
 
 // MaybeStartSubtree implements api.Batch.
-func (ba *rocksdbBatch) MaybeStartSubtree(subtree api.Subtree, depth node.Depth, subtreeRoot *node.Pointer) api.Subtree {
+func (ba *rocksdbBatch) MaybeStartSubtree(subtree api.Subtree, _ node.Depth, _ *node.Pointer) api.Subtree {
 	if subtree == nil {
 		return &rocksdbSubtree{batch: ba}
 	}
@@ -216,23 +212,24 @@ func (s *rocksdbSubtree) PutNode(_ node.Depth, ptr *node.Pointer) error {
 		return err
 	}
 
+	cf := s.batch.db.getColumnFamilyForType(s.batch.rootType)
 	h := ptr.Node.GetHash()
 	s.batch.updatedNodes = append(s.batch.updatedNodes, updatedNode{Hash: h})
 	nodeKey := nodeKeyFmt.Encode(&h)
 	if s.batch.multipartNodes != nil {
-		item, err := s.batch.db.db.GetCF(timestampReadOptions(s.batch.version), s.batch.db.cfNode, nodeKey)
+		item, err := s.batch.db.db.GetCF(timestampReadOptions(s.batch.version), cf, nodeKey)
 		if err != nil {
 			return err
 		}
 		defer item.Free()
 		if !item.Exists() {
-			th := node.TypedHashFromParts(node.RootTypeInvalid, h)
+			th := node.TypedHashFromParts(s.batch.rootType, h)
 			s.batch.multipartNodes.Put(multipartRestoreNodeLogKeyFmt.Encode(&th), []byte{})
 		}
 	}
 
 	ts := timestampFromVersion(s.batch.version)
-	s.batch.bat.PutCFWithTS(s.batch.db.cfNode, nodeKey, ts[:], data)
+	s.batch.bat.PutCFWithTS(cf, nodeKey, ts[:], data)
 	return nil
 }
diff --git a/go/storage/mkvs/db/rocksdb/rocksdb.go b/go/storage/mkvs/db/rocksdb/rocksdb.go
index 7b83ccc135d..2d4d7138ea9 100644
--- a/go/storage/mkvs/db/rocksdb/rocksdb.go
+++ b/go/storage/mkvs/db/rocksdb/rocksdb.go
@@ -52,9 +52,17 @@ var (
 	//
 	// Value is empty.
 	multipartRestoreNodeLogKeyFmt = keyformat.New(0x03, &node.TypedHash{})
+
+	// multipartRestoreRootLogKeyFmt is the key format for the root nodes inserted during a chunk restore.
+	// Once a set of chunks is fully restored, these entries should be removed. If chunk restoration
+	// is interrupted for any reason, the nodes associated with these keys should be removed, along
+	// with these entries.
+	//
+	// Value is empty.
+	multipartRestoreRootLogKeyFmt = keyformat.New(0x04, &node.TypedHash{})
 )
 
-// Node CF keys (timestamped).
+// Node CF keys (timestamped and used by state and io tree CFs).
 var (
 	// nodeKeyFmt is the key format for nodes (node hash).
 	//
@@ -79,10 +87,9 @@ var (
 )
 
 const (
-	cfMetadataName = "default"
-	cfNodeTree     = "node"
-	// cfStateTreeName = "state_tree"
-	// cfIOTreeName    = "io_tree"
+	cfMetadataName  = "default"
+	cfStateTreeName = "state_tree"
+	cfIOTreeName    = "io_tree"
 )
 
 // New creates a new RocksDB-backed node database.
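For readers following the key-format changes above: the keyformat package builds fixed-layout keys from a one-byte prefix plus typed fields, and Decode rejects keys carrying a different prefix. A minimal sketch of how a multipart restore log entry round-trips, illustrative only; the variable names are not part of this patch:

// Illustrative sketch: round-tripping a multipart restore log key using the
// formats defined above. Assumes h is the hash of a state-tree node.
th := node.TypedHashFromParts(node.RootTypeState, h)
key := multipartRestoreNodeLogKeyFmt.Encode(&th) // 0x03 prefix followed by the typed hash.

var dec node.TypedHash
if multipartRestoreNodeLogKeyFmt.Decode(key, &dec) {
	// dec now equals th; Decode returns false for keys with a different prefix.
}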
@@ -100,6 +107,7 @@ func New(cfg *api.Config) (api.NodeDB, error) { // Also see: https://github.com/facebook/rocksdb/wiki/RocksDB-Tuning-Guide // Create options for the metadata column family. + // TODO: Consider some tuning for meta options. optsMeta := grocksdb.NewDefaultOptions() optsMeta.SetCreateIfMissing(true) optsMeta.SetCreateIfMissingColumnFamilies(true) @@ -152,14 +160,13 @@ func New(cfg *api.Config) (api.NodeDB, error) { cfg.DB, []string{ cfMetadataName, - cfNodeTree, - // cfStateTreeName, - // cfIOTreeName, + cfStateTreeName, + cfIOTreeName, }, []*grocksdb.Options{ optsMeta, optsNodes, - // optsNodes, + optsNodes, }, false) case false: @@ -168,14 +175,13 @@ func New(cfg *api.Config) (api.NodeDB, error) { cfg.DB, []string{ cfMetadataName, - cfNodeTree, - // cfStateTreeName, - // cfIOTreeName, + cfStateTreeName, + cfIOTreeName, }, []*grocksdb.Options{ optsMeta, optsNodes, - // optsNodes, + optsNodes, }, ) } @@ -183,9 +189,8 @@ func New(cfg *api.Config) (api.NodeDB, error) { return nil, fmt.Errorf("mkvs/rocksdb: failed to open database: %w", err) } db.cfMetadata = cfHandles[0] // Also the default handle. - db.cfNode = cfHandles[1] - // db.cfStateTree = cfHandles[1] - // db.cfIOTree = cfHandles[2] + db.cfStateTree = cfHandles[1] + db.cfIOTree = cfHandles[2] // Load database metadata. if err = db.load(); err != nil { @@ -217,28 +222,18 @@ type rocksdbNodeDB struct { discardWriteLogs bool - db *grocksdb.DB - cfMetadata *grocksdb.ColumnFamilyHandle - cfNode *grocksdb.ColumnFamilyHandle - // cfStateTree *grocksdb.ColumnFamilyHandle - // cfIOTree *grocksdb.ColumnFamilyHandle + db *grocksdb.DB + cfMetadata *grocksdb.ColumnFamilyHandle + cfStateTree *grocksdb.ColumnFamilyHandle + cfIOTree *grocksdb.ColumnFamilyHandle closeOnce sync.Once } -/* func (d *rocksdbNodeDB) getColumnFamilyForRoot(root node.Root) *grocksdb.ColumnFamilyHandle { - switch root.Type { - case node.RootTypeState: - return d.cfStateTree - case node.RootTypeIO: - return d.cfIOTree - default: - panic(fmt.Errorf("unsupported root type: %s", root.Type)) - } + return d.getColumnFamilyForType(root.Type) } - func (d *rocksdbNodeDB) getColumnFamilyForType(rootType node.RootType) *grocksdb.ColumnFamilyHandle { switch rootType { case node.RootTypeState: @@ -249,7 +244,6 @@ func (d *rocksdbNodeDB) getColumnFamilyForType(rootType node.RootType) *grocksdb panic(fmt.Errorf("unsupported root type: %s", rootType)) } } -*/ func (d *rocksdbNodeDB) load() error { /* @@ -271,7 +265,7 @@ func (d *rocksdbNodeDB) load() error { // Metadata already exists, just load it and verify that it is // compatible with what we have here. 
- if err := cbor.UnmarshalTrusted(item.Data(), &d.meta.value); err != nil { + if err = cbor.UnmarshalTrusted(item.Data(), &d.meta.value); err != nil { return err } @@ -311,8 +305,9 @@ func (d *rocksdbNodeDB) sanityCheckNamespace(ns common.Namespace) error { func (d *rocksdbNodeDB) checkRoot(root node.Root) error { rootHash := node.TypedHashFromRoot(root) + cf := d.getColumnFamilyForRoot(root) - s, err := d.db.GetCF(timestampReadOptions(root.Version), d.cfNode, rootNodeKeyFmt.Encode(&rootHash)) + s, err := d.db.GetCF(timestampReadOptions(root.Version), cf, rootNodeKeyFmt.Encode(&rootHash)) if err != nil { d.logger.Error("failed to check root existence", "err", err, @@ -346,8 +341,8 @@ func (d *rocksdbNodeDB) GetNode(root node.Root, ptr *node.Pointer) (node.Node, e return nil, err } - // cf := d.getColumnFamilyForRoot(root) - s, err := d.db.GetCF(timestampReadOptions(root.Version), d.cfNode, nodeKeyFmt.Encode(&ptr.Hash)) + cf := d.getColumnFamilyForRoot(root) + s, err := d.db.GetCF(timestampReadOptions(root.Version), cf, nodeKeyFmt.Encode(&ptr.Hash)) if err != nil { return nil, fmt.Errorf("mkvs/rocksdb: failed to get node from backing store: %w", err) } @@ -385,6 +380,8 @@ func (d *rocksdbNodeDB) GetWriteLog(ctx context.Context, startRoot, endRoot node return nil, err } + cf := d.getColumnFamilyForRoot(startRoot) + // Start at the end root and search towards the start root. This assumes that the // chains are not long and that there is not a lot of forks as in that case performance // would suffer. @@ -418,7 +415,7 @@ func (d *rocksdbNodeDB) GetWriteLog(ctx context.Context, startRoot, endRoot node wl, err := func() (writelog.Iterator, error) { // Iterate over all write logs that result in the current item. prefix := writeLogKeyFmt.Encode(endRoot.Version, &curItem.endRootHash) - it := prefixIterator(d.db.NewIteratorCF(timestampReadOptions(endRoot.Version), d.cfNode), prefix) + it := prefixIterator(d.db.NewIteratorCF(timestampReadOptions(endRoot.Version), cf), prefix) defer it.Close() for ; it.Valid(); it.Next() { @@ -462,7 +459,7 @@ func (d *rocksdbNodeDB) GetWriteLog(ctx context.Context, startRoot, endRoot node Hash: nextItem.logRoots[index].Hash(), } - item, err := d.db.GetCF(timestampReadOptions(endRoot.Version), d.cfNode, key) + item, err := d.db.GetCF(timestampReadOptions(endRoot.Version), cf, key) if err != nil || !item.Exists() { return node.Root{}, nil, err } @@ -558,7 +555,7 @@ func (d *rocksdbNodeDB) HasRoot(root node.Root) bool { return exists } -func (d *rocksdbNodeDB) Finalize(roots []node.Root) error { +func (d *rocksdbNodeDB) Finalize(roots []node.Root) error { // nolint: gocyclo if len(roots) == 0 { return fmt.Errorf("mkvs/rocksdb: need at least one root to finalize") } @@ -625,8 +622,8 @@ func (d *rocksdbNodeDB) Finalize(roots []node.Root) error { ts := timestampFromVersion(version) // Go through all roots and prune them based on whether they are finalized or not. - maybeLoneNodes := make(map[hash.Hash]bool) - notLoneNodes := make(map[hash.Hash]bool) + maybeLoneNodes := make(map[hash.Hash]node.RootType) + notLoneNodes := make(map[hash.Hash]node.RootType) for rootHash := range rootsMeta.Roots { // TODO: Consider colocating updated nodes with the root metadata. 
@@ -642,7 +639,7 @@ func (d *rocksdbNodeDB) Finalize(roots []node.Root) error { } var updatedNodes []updatedNode - if err := cbor.UnmarshalTrusted(item.Data(), &updatedNodes); err != nil { + if err = cbor.UnmarshalTrusted(item.Data(), &updatedNodes); err != nil { panic(fmt.Errorf("mkvs/rocksdb: corrupted root updated nodes index: %w", err)) } item.Free() @@ -651,9 +648,9 @@ func (d *rocksdbNodeDB) Finalize(roots []node.Root) error { // Make sure not to remove any nodes shared with finalized roots. for _, n := range updatedNodes { if n.Removed { - maybeLoneNodes[n.Hash] = true + maybeLoneNodes[n.Hash] = rootHash.Type() } else { - notLoneNodes[n.Hash] = true + notLoneNodes[n.Hash] = rootHash.Type() } } } else { @@ -663,7 +660,7 @@ func (d *rocksdbNodeDB) Finalize(roots []node.Root) error { // roots added in the same version. for _, n := range updatedNodes { if !n.Removed { - maybeLoneNodes[n.Hash] = true + maybeLoneNodes[n.Hash] = rootHash.Type() } } @@ -673,13 +670,13 @@ func (d *rocksdbNodeDB) Finalize(roots []node.Root) error { // Remove write logs for the non-finalized root. if !d.discardWriteLogs { if err = func() error { + cf := d.getColumnFamilyForType(rootHash.Type()) rootWriteLogsPrefix := writeLogKeyFmt.Encode(version, &rootHash) - wit := prefixIterator(d.db.NewIteratorCF(timestampReadOptions(version), d.cfNode), rootWriteLogsPrefix) + wit := prefixIterator(d.db.NewIteratorCF(timestampReadOptions(version), cf), rootWriteLogsPrefix) defer wit.Close() - // cf := d.getColumnFamilyForType(rootHash.Type()) for ; wit.Valid(); wit.Next() { - batch.DeleteCFWithTS(d.cfNode, wit.Key(), ts[:]) + batch.DeleteCFWithTS(cf, wit.Key(), ts[:]) } return nil }(); err != nil { @@ -694,14 +691,11 @@ func (d *rocksdbNodeDB) Finalize(roots []node.Root) error { // Clean any lone nodes. for h := range maybeLoneNodes { - if notLoneNodes[h] { + if _, ok := notLoneNodes[h]; ok { continue } - // TODO: get CF for hash? - // batch.DeleteCFWithTS(d.cfIOTree, nodeKeyFmt.Encode(&h), ts[:]) - // batch.DeleteCFWithTS(d.cfStateTree, nodeKeyFmt.Encode(&h), ts[:]) - batch.DeleteCFWithTS(d.cfNode, nodeKeyFmt.Encode(&h), ts[:]) + batch.DeleteCFWithTS(d.getColumnFamilyForType(maybeLoneNodes[h]), nodeKeyFmt.Encode(&h), ts[:]) } // Save roots metadata if changed. @@ -768,8 +762,9 @@ func (d *rocksdbNodeDB) Prune(ctx context.Context, version uint64) error { var innerErr error err := api.Visit(ctx, d, root, func(ctx context.Context, n node.Node) bool { h := n.GetHash() + cf := d.getColumnFamilyForRoot(root) - s, ts, err := d.db.GetCFWithTS(timestampReadOptions(root.Version), d.cfNode, nodeKeyFmt.Encode(&h)) + s, ts, err := d.db.GetCFWithTS(timestampReadOptions(root.Version), cf, nodeKeyFmt.Encode(&h)) if err != nil { return false } @@ -784,7 +779,7 @@ func (d *rocksdbNodeDB) Prune(ctx context.Context, version uint64) error { panic(fmt.Errorf("mkvs/rocksdb: missing/corrupted timestamp for node: %s", h)) } if itemTs == version { - batch.DeleteCFWithTS(d.cfNode, nodeKeyFmt.Encode(&h), ts.Data()) + batch.DeleteCFWithTS(cf, nodeKeyFmt.Encode(&h), ts.Data()) } return true }) @@ -800,13 +795,16 @@ func (d *rocksdbNodeDB) Prune(ctx context.Context, version uint64) error { // Prune all write logs in version. 
if !d.discardWriteLogs { - wit := prefixIterator(d.db.NewIteratorCF(timestampReadOptions(version), d.cfNode), writeLogKeyFmt.Encode(version)) - defer wit.Close() + discardLogs := func(cf *grocksdb.ColumnFamilyHandle) { + wit := prefixIterator(d.db.NewIteratorCF(timestampReadOptions(version), cf), writeLogKeyFmt.Encode(version)) + defer wit.Close() - for ; wit.Valid(); wit.Next() { - batch.DeleteCFWithTS(d.cfNode, wit.Key(), ts[:]) + for ; wit.Valid(); wit.Next() { + batch.DeleteCFWithTS(cf, wit.Key(), ts[:]) + } } - + discardLogs(d.cfStateTree) + discardLogs(d.cfIOTree) } // Update metadata. @@ -816,14 +814,11 @@ func (d *rocksdbNodeDB) Prune(ctx context.Context, version uint64) error { return fmt.Errorf("mkvs/rocksdb: failed to prune version %d: %w", version, err) } - // if err := d.db.IncreaseFullHistoryTsLow(d.cfIOTree, ts[:]); err != nil { - // return fmt.Errorf("mkvs/rocksdb: failed to prune version %d from IO tree: %w", version, err) - // } - // if err := d.db.IncreaseFullHistoryTsLow(d.cfStateTree, ts[:]); err != nil { - // return fmt.Errorf("mkvs/rocksdb: failed to prune version %d from state tree: %w", version, err) - // } - if err := d.db.IncreaseFullHistoryTsLow(d.cfNode, ts[:]); err != nil { - return fmt.Errorf("mkvs/rocksdb: failed to prune version %d from nodes tree: %w", version, err) + if err := d.db.IncreaseFullHistoryTsLow(d.cfIOTree, ts[:]); err != nil { + return fmt.Errorf("mkvs/rocksdb: failed to prune version %d from IO tree: %w", version, err) + } + if err := d.db.IncreaseFullHistoryTsLow(d.cfStateTree, ts[:]); err != nil { + return fmt.Errorf("mkvs/rocksdb: failed to prune version %d from state tree: %w", version, err) } return nil } @@ -874,38 +869,45 @@ func (d *rocksdbNodeDB) cleanMultipartLocked(removeNodes bool) error { return nil } - it := prefixIterator(d.db.NewIterator(defaultReadOptions), multipartRestoreNodeLogKeyFmt.Encode()) - defer it.Close() - batch := grocksdb.NewWriteBatch() defer batch.Destroy() ts := timestampFromVersion(version) var logged bool - for ; it.Valid(); it.Next() { - key := it.Key() - var hash node.TypedHash - if !multipartRestoreNodeLogKeyFmt.Decode(key, &hash) { - break - } + // Clean up the node log. + cleanNodes := func(keyFormat *keyformat.KeyFormat, isRoot bool) { + it := prefixIterator(d.db.NewIterator(defaultReadOptions), keyFormat.Encode()) + defer it.Close() + for ; it.Valid(); it.Next() { + key := it.Key() - if removeNodes { - if !logged { - d.logger.Info("removing some nodes from a multipart restore") - logged = true + var hash node.TypedHash + if !keyFormat.Decode(key, &hash) { + break } - switch hash.Type() { - case node.RootTypeInvalid: - h := hash.Hash() - batch.DeleteCFWithTS(d.cfNode, nodeKeyFmt.Encode(&h), ts[:]) - default: - // cf := d.getColumnFamilyForType(hash.Type()) - batch.DeleteCFWithTS(d.cfNode, rootNodeKeyFmt.Encode(&hash), ts[:]) + cf := d.getColumnFamilyForType(hash.Type()) + + if removeNodes { + if !logged { + d.logger.Info("removing some nodes from a multipart restore") + logged = true + } + + switch isRoot { + case false: + h := hash.Hash() + batch.DeleteCFWithTS(cf, nodeKeyFmt.Encode(&h), ts[:]) + default: + cf := d.getColumnFamilyForType(hash.Type()) + batch.DeleteCFWithTS(cf, rootNodeKeyFmt.Encode(&hash), ts[:]) + } } + // Delete the metadata entry as well. + batch.Delete(key) } - // Delete the metadata entry as well. - batch.Delete(key) } + cleanNodes(multipartRestoreNodeLogKeyFmt, false) + cleanNodes(multipartRestoreRootLogKeyFmt, true) // Apply the batch first. 
If anything fails, having corrupt // multipart info in d.meta shouldn't hurt us next run. @@ -922,16 +924,12 @@ func (d *rocksdbNodeDB) cleanMultipartLocked(removeNodes bool) error { } func (d *rocksdbNodeDB) NewBatch(oldRoot node.Root, version uint64, chunk bool) (api.Batch, error) { - // WARNING: There is a maximum batch size and maximum batch entry count. - // Both of these things are derived from the MaxTableSize option. - // - // The size limit also applies to normal transactions, so the "right" - // thing to do would be to either crank up MaxTableSize or maybe split - // the transaction out. - if d.readOnly { return nil, api.ErrReadOnly } + if oldRoot.Type != node.RootTypeState && oldRoot.Type != node.RootTypeIO { + return nil, fmt.Errorf("mkvs/rocksdb: unsupported root type: %s", oldRoot.Type) + } d.metaUpdateLock.Lock() defer d.metaUpdateLock.Unlock() @@ -945,14 +943,13 @@ func (d *rocksdbNodeDB) NewBatch(oldRoot node.Root, version uint64, chunk bool) var logBatch *grocksdb.WriteBatch if d.multipartVersion != multipartVersionNone { - // The node log is at a different version than the nodes themselves, - // which is awkward. logBatch = grocksdb.NewWriteBatch() } return &rocksdbBatch{ db: d, version: version, + rootType: oldRoot.Type, bat: grocksdb.NewWriteBatch(), multipartNodes: logBatch, oldRoot: oldRoot, @@ -962,25 +959,23 @@ func (d *rocksdbNodeDB) NewBatch(oldRoot node.Root, version uint64, chunk bool) func (d *rocksdbNodeDB) Size() (uint64, error) { meta := d.db.GetColumnFamilyMetadataCF(d.cfMetadata) - // io := d.db.GetColumnFamilyMetadataCF(d.cfIOTree) - // state := d.db.GetColumnFamilyMetadataCF(d.cfStateTree) - node := d.db.GetColumnFamilyMetadataCF(d.cfNode) + io := d.db.GetColumnFamilyMetadataCF(d.cfIOTree) + state := d.db.GetColumnFamilyMetadataCF(d.cfStateTree) - return meta.Size() + node.Size(), nil // io.Size() + state.Size(), nil + return meta.Size() + io.Size() + state.Size(), nil } func (d *rocksdbNodeDB) Sync() error { opts := grocksdb.NewDefaultFlushOptions() - return d.db.FlushCFs([]*grocksdb.ColumnFamilyHandle{d.cfMetadata, d.cfNode}, opts) + return d.db.FlushCFs([]*grocksdb.ColumnFamilyHandle{d.cfMetadata, d.cfIOTree, d.cfStateTree}, opts) } func (d *rocksdbNodeDB) Close() { d.closeOnce.Do(func() { d.db.Close() d.cfMetadata = nil - // d.cfIOTree = nil - // d.cfStateTree = nil - d.cfNode = nil + d.cfIOTree = nil + d.cfStateTree = nil d.db = nil }) } diff --git a/go/storage/mkvs/db/rocksdb/rocksdb_test.go b/go/storage/mkvs/db/rocksdb/rocksdb_test.go index 0550b352674..3b355ed34a4 100644 --- a/go/storage/mkvs/db/rocksdb/rocksdb_test.go +++ b/go/storage/mkvs/db/rocksdb/rocksdb_test.go @@ -8,6 +8,7 @@ import ( "strconv" "testing" + "github.com/linxGnu/grocksdb" "github.com/stretchr/testify/require" "github.com/oasisprotocol/oasis-core/go/common" @@ -107,13 +108,18 @@ func createCheckpoint(ctx context.Context, require *require.Assertions, dir stri require.NoError(err, "CreateCheckpoint()") nodeKeys := keySet{} - it := prefixIterator(rocksdb.db.NewIteratorCF(timestampReadOptions(2), rocksdb.cfNode), nil) - defer it.Close() - for ; it.Valid(); it.Next() { - if bytes.HasPrefix(it.Key(), nodePrefix) { - nodeKeys[string(it.Key())] = struct{}{} + + loadNodes := func(cf *grocksdb.ColumnFamilyHandle) { + it := prefixIterator(rocksdb.db.NewIteratorCF(timestampReadOptions(2), cf), nil) + defer it.Close() + for ; it.Valid(); it.Next() { + if bytes.HasPrefix(it.Key(), nodePrefix) { + nodeKeys[string(it.Key())] = struct{}{} + } } } + loadNodes(rocksdb.cfIOTree) + 
loadNodes(rocksdb.cfStateTree)
 
 	return ckMeta, nodeKeys
 }
@@ -124,17 +130,22 @@ func verifyNodes(require *require.Assertions, rocksdb *rocksdbNodeDB, version ui
 		notVisited[k] = struct{}{}
 	}
 
-	it := prefixIterator(rocksdb.db.NewIteratorCF(timestampReadOptions(version), rocksdb.cfNode), nil)
-	defer it.Close()
-	for ; it.Valid(); it.Next() {
-		key := it.Key()
-		if !bytes.HasPrefix(key, nodePrefix) {
-			continue
+	checkNodes := func(cf *grocksdb.ColumnFamilyHandle) {
+		it := prefixIterator(rocksdb.db.NewIteratorCF(timestampReadOptions(version), cf), nil)
+		defer it.Close()
+		for ; it.Valid(); it.Next() {
+			key := it.Key()
+			if !bytes.HasPrefix(key, nodePrefix) {
+				continue
+			}
+			_, ok := keySet[string(key)]
+			require.Equal(true, ok, "unexpected node in db")
+			delete(notVisited, string(key))
 		}
-		_, ok := keySet[string(key)]
-		require.Equal(true, ok, "unexpected node in db")
-		delete(notVisited, string(key))
 	}
+	checkNodes(rocksdb.cfIOTree)
+	checkNodes(rocksdb.cfStateTree)
+
 	require.Equal(0, len(notVisited), "some nodes not visited")
 }
diff --git a/go/storage/mkvs/db/rocksdb/timestamp.go b/go/storage/mkvs/db/rocksdb/timestamp.go
index bbaf3414aad..14c1c2be7ba 100644
--- a/go/storage/mkvs/db/rocksdb/timestamp.go
+++ b/go/storage/mkvs/db/rocksdb/timestamp.go
@@ -25,7 +25,7 @@ func createTimestampComparator() *grocksdb.Comparator {
 	)
 }
 
-// gorocksdb.Comparing.
+// Implements gorocksdb.Comparing.
 func compareTimestampKeys(a, b []byte) int {
 	// First compare keys without timestamps.
 	if ret := compareWithoutTimestamp(a, true, b, true); ret != 0 {
@@ -36,7 +36,7 @@ func compareTimestampKeys(a, b []byte) int {
 	return -compareTimestamp(a[len(a)-timestampSize:], b[len(b)-timestampSize:])
 }
 
-// gorocksdb.Comparing.
+// Implements gorocksdb.Comparing.
 func compareTimestamp(a, b []byte) int {
 	ts1 := binary.LittleEndian.Uint64(a)
 	ts2 := binary.LittleEndian.Uint64(b)
@@ -51,7 +51,7 @@ func compareTimestamp(a, b []byte) int {
 	}
 }
 
-// gorocksdb.ComparingWithoutTimestamp.
+// Implements gorocksdb.ComparingWithoutTimestamp.
 func compareWithoutTimestamp(a []byte, aHasTs bool, b []byte, bHasTs bool) int {
 	if aHasTs {
 		a = a[:len(a)-timestampSize]
@@ -62,13 +62,7 @@ func compareWithoutTimestamp(a []byte, aHasTs bool, b []byte, bHasTs bool) int {
 	return bytes.Compare(a, b)
 }
 
-func timestampFromVersion(version uint64) [timestampSize]byte {
-	var ts [timestampSize]byte
-	binary.LittleEndian.PutUint64(ts[:], version)
-	return ts
-}
-
-// timestampReadOptions returns ReadOptions used in the RocksDB column family read.
+// timestampReadOptions returns the default read options with the read timestamp set.
 func timestampReadOptions(version uint64) *grocksdb.ReadOptions {
 	ts := timestampFromVersion(version)
 
@@ -78,6 +72,12 @@ func timestampReadOptions(version uint64) *grocksdb.ReadOptions {
 	return readOpts
 }
 
+func timestampFromVersion(version uint64) [timestampSize]byte {
+	var ts [timestampSize]byte
+	binary.LittleEndian.PutUint64(ts[:], version)
+	return ts
+}
+
 func versionFromTimestamp(ts *grocksdb.Slice) (uint64, error) {
 	if !ts.Exists() {
 		return 0, fmt.Errorf("timestamp empty")

From 0fd7891c9cc58c86fea70f4b49cedca35210ff20 Mon Sep 17 00:00:00 2001
From: ptrus
Date: Thu, 2 Nov 2023 14:15:34 +0100
Subject: [PATCH 04/28] storage/rocksdb: conditionally build rocksdb

---
 go/Makefile                                | 23 ++++++++++++++++++++--
 go/go.mod                                  |  8 ++++----
 go/go.sum                                  | 17 ++++------------
 go/storage/mkvs/db/rocksdb/batch.go        |  3 +++
 go/storage/mkvs/db/rocksdb/iterator.go     |  3 +++
 go/storage/mkvs/db/rocksdb/metadata.go     |  3 +++
 go/storage/mkvs/db/rocksdb/norocksdb.go    | 14 +++++++++++++
 go/storage/mkvs/db/rocksdb/rocksdb.go      |  3 +++
 go/storage/mkvs/db/rocksdb/rocksdb_test.go |  3 +++
 go/storage/mkvs/db/rocksdb/timestamp.go    | 11 +++++++----
 10 files changed, 65 insertions(+), 23 deletions(-)
 create mode 100644 go/storage/mkvs/db/rocksdb/norocksdb.go

diff --git a/go/Makefile b/go/Makefile
index eaa1c2daacf..afc65563d3e 100644
--- a/go/Makefile
+++ b/go/Makefile
@@ -5,9 +5,28 @@ ifneq ($(GOLDFLAGS),)
 	GO_EXTRA_FLAGS += -ldflags $(GOLDFLAGS)
 endif
 
-# Build code with jemalloc tag unless explicitly disabled (used by badgerdb).
+# Initialize GO_TAGS variable to hold the build tags.
+GO_TAGS :=
+
+# Include jemalloc tag unless explicitly disabled (used by badgerdb).
 ifneq ($(OASIS_BADGER_NO_JEMALLOC), 1)
-	GO_EXTRA_FLAGS += -tags jemalloc
+	ifneq ($(GO_TAGS),)
+		GO_TAGS := $(GO_TAGS),
+	endif
+	GO_TAGS := $(GO_TAGS)jemalloc
+endif
+
+# Include rocksdb tags unless explicitly disabled.
+ifneq ($(OASIS_NO_ROCKSDB), 1)
+	ifneq ($(GO_TAGS),)
+		GO_TAGS := $(GO_TAGS),
+	endif
+	GO_TAGS := $(GO_TAGS)rocksdb,grocksdb_clean_link
+endif
+
+# If GO_TAGS is not empty, append it to GO_EXTRA_FLAGS.
+ifneq ($(GO_TAGS),)
+	GO_EXTRA_FLAGS += -tags "$(GO_TAGS)"
 endif
 
 # Set all target as the default target.
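The GO_TAGS plumbing above boils down to an invocation along the lines of go build -tags "jemalloc,rocksdb,grocksdb_clean_link"; setting OASIS_NO_ROCKSDB=1 drops the rocksdb tags, so the norocksdb.go stub introduced in this patch is compiled instead of the real backend. A hedged sketch of what a caller then sees, with the surrounding variable names being illustrative only:

// Without the rocksdb build tag, rocksdb.New resolves to the stub in
// norocksdb.go and only ever returns an error.
ndb, err := rocksdb.New(&api.Config{DB: dir, Namespace: ns})
if err != nil {
	// err reads "mkvs/rocksdb: not compiled with RocksDB support".
	return err
}
defer ndb.Close()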
diff --git a/go/go.mod b/go/go.mod index 8a11daa8984..609bb7ab089 100644 --- a/go/go.mod +++ b/go/go.mod @@ -9,6 +9,7 @@ replace ( golang.org/x/crypto/curve25519 => github.com/oasisprotocol/curve25519-voi/primitives/x25519 v0.0.0-20210505121811-294cf0fbfb43 golang.org/x/crypto/ed25519 => github.com/oasisprotocol/curve25519-voi/primitives/ed25519 v0.0.0-20210505121811-294cf0fbfb43 + ) require ( @@ -16,7 +17,7 @@ require ( github.com/btcsuite/btcutil v1.0.3-0.20201208143702-a53e38424cce github.com/cenkalti/backoff/v4 v4.2.1 github.com/cometbft/cometbft v0.0.0-00010101000000-000000000000 - github.com/cometbft/cometbft-db v0.7.0 + github.com/cometbft/cometbft-db v0.8.0 github.com/cosmos/gogoproto v1.4.1 github.com/dgraph-io/badger/v3 v3.2103.4 github.com/eapache/channels v1.1.0 @@ -33,6 +34,7 @@ require ( github.com/ipfs/go-log/v2 v2.5.1 github.com/libp2p/go-libp2p v0.30.0 github.com/libp2p/go-libp2p-pubsub v0.9.3 + github.com/linxGnu/grocksdb v1.8.4 github.com/multiformats/go-multiaddr v0.11.0 github.com/oasisprotocol/curve25519-voi v0.0.0-20230110094441-db37f07504ce github.com/oasisprotocol/deoxysii v0.0.0-20220228165953-2091330c22b7 @@ -126,7 +128,6 @@ require ( github.com/libp2p/go-netroute v0.2.1 // indirect github.com/libp2p/go-reuseport v0.4.0 // indirect github.com/libp2p/go-yamux/v4 v4.0.1 // indirect - github.com/linxGnu/grocksdb v1.8.4 // indirect github.com/magiconair/properties v1.8.7 // indirect github.com/marten-seemann/tcp v0.0.0-20210406111302-dfbc87cc63fd // indirect github.com/mattn/go-colorable v0.1.13 // indirect @@ -176,9 +177,8 @@ require ( github.com/spf13/jwalterweatherman v1.1.0 // indirect github.com/subosito/gotenv v1.4.2 // indirect github.com/syndtr/goleveldb v1.0.1-0.20210819022825-2ae1ddf74ef7 // indirect - github.com/tecbot/gorocksdb v0.0.0-20191217155057-f0fad39f321c // indirect github.com/x448/float16 v0.8.4 // indirect - go.etcd.io/bbolt v1.3.6 // indirect + go.etcd.io/bbolt v1.3.7 // indirect go.opencensus.io v0.24.0 // indirect go.uber.org/dig v1.17.0 // indirect go.uber.org/fx v1.20.0 // indirect diff --git a/go/go.sum b/go/go.sum index 1fb1b3f5ae5..49a349d3964 100644 --- a/go/go.sum +++ b/go/go.sum @@ -108,8 +108,8 @@ github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDk github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= github.com/cncf/udpa/go v0.0.0-20200629203442-efcf912fb354/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= github.com/cncf/udpa/go v0.0.0-20201120205902-5459f2c99403/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= -github.com/cometbft/cometbft-db v0.7.0 h1:uBjbrBx4QzU0zOEnU8KxoDl18dMNgDh+zZRUE0ucsbo= -github.com/cometbft/cometbft-db v0.7.0/go.mod h1:yiKJIm2WKrt6x8Cyxtq9YTEcIMPcEe4XPxhgX59Fzf0= +github.com/cometbft/cometbft-db v0.8.0 h1:vUMDaH3ApkX8m0KZvOFFy9b5DZHBAjsnEuo9AKVZpjo= +github.com/cometbft/cometbft-db v0.8.0/go.mod h1:6ASCP4pfhmrCBpfk01/9E1SI29nD3HfVHrY4PG8x5c0= github.com/containerd/cgroups v0.0.0-20201119153540-4cbc285b3327/go.mod h1:ZJeTFisyysqgcCdecO57Dj79RfL0LNeGiFUqLYQRYLE= github.com/containerd/cgroups v1.1.0 h1:v8rEWFl6EoqHB+swVNjVoCJE8o3jX7e8nqBGPLaDFBM= github.com/containerd/cgroups v1.1.0/go.mod h1:6ppBcbh/NOOUU+dMKrykgaBnK9lCIBxHqJDGwsa1mIw= @@ -173,12 +173,6 @@ github.com/envoyproxy/go-control-plane v0.9.7/go.mod h1:cwu0lG7PUMfa9snN8LXBig5y github.com/envoyproxy/go-control-plane v0.9.9-0.20201210154907-fd9021fe5dad/go.mod h1:cXg6YxExXjJnVBQHBLXeUAgxn2UodCpnH306RInaBQk= github.com/envoyproxy/go-control-plane 
v0.9.9-0.20210217033140-668b12f5399d/go.mod h1:cXg6YxExXjJnVBQHBLXeUAgxn2UodCpnH306RInaBQk= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= -github.com/facebookgo/ensure v0.0.0-20200202191622-63f1cf65ac4c h1:8ISkoahWXwZR41ois5lSJBSVw4D0OV19Ht/JSTzvSv0= -github.com/facebookgo/ensure v0.0.0-20200202191622-63f1cf65ac4c/go.mod h1:Yg+htXGokKKdzcwhuNDwVvN+uBxDGXJ7G/VN1d8fa64= -github.com/facebookgo/stack v0.0.0-20160209184415-751773369052 h1:JWuenKqqX8nojtoVVWjGfOF9635RETekkoH6Cc9SX0A= -github.com/facebookgo/stack v0.0.0-20160209184415-751773369052/go.mod h1:UbMTZqLaRiH3MsBH8va0n7s1pQYcu3uTb8G4tygF4Zg= -github.com/facebookgo/subset v0.0.0-20200203212716-c811ad88dec4 h1:7HZCaLC5+BZpmbhCOZJ293Lz68O7PYrF2EzeiFMwCLk= -github.com/facebookgo/subset v0.0.0-20200203212716-c811ad88dec4/go.mod h1:5tD+neXqOorC30/tWg0LCSkrqj/AR6gu8yY8/fpw1q0= github.com/fatih/color v1.13.0 h1:8LOYc1KYPPmyKMuN8QV2DNRWNbLo6LZ0iLs8+mlH53w= github.com/fatih/color v1.13.0/go.mod h1:kLAiJbzzSOZDVNGyDpeOxJ47H46qBXwg5ILebYFFOfk= github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:xEzjJPgXI435gkrCt3MPfRiAkVrwSbHsst4LCFVfpJc= @@ -670,8 +664,6 @@ github.com/subosito/gotenv v1.4.2/go.mod h1:ayKnFf/c6rvx/2iiLrJUk1e6plDbT3edrFNG github.com/syndtr/goleveldb v1.0.1-0.20210819022825-2ae1ddf74ef7 h1:epCh84lMvA70Z7CTTCmYQn2CKbY8j86K7/FAIr141uY= github.com/syndtr/goleveldb v1.0.1-0.20210819022825-2ae1ddf74ef7/go.mod h1:q4W45IWZaF22tdD+VEXcAWRA037jwmWEB5VWYORlTpc= github.com/tarm/serial v0.0.0-20180830185346-98f6abe2eb07/go.mod h1:kDXzergiv9cbyO7IOYJZWg1U88JhDg3PB6klq9Hg2pA= -github.com/tecbot/gorocksdb v0.0.0-20191217155057-f0fad39f321c h1:g+WoO5jjkqGAzHWCjJB1zZfXPIAaDpzXIEJ0eS6B5Ok= -github.com/tecbot/gorocksdb v0.0.0-20191217155057-f0fad39f321c/go.mod h1:ahpPrc7HpcfEWDQRZEmnXMzHY03mLDYMCxeDzy46i+8= github.com/thepudds/fzgo v0.2.2 h1:bGofmgAGfTLpVgETkL9jvhg6azylvCF/kW6JPy5fkzQ= github.com/thepudds/fzgo v0.2.2/go.mod h1:ZgigL1toyKrar3rWdXz7Fuv7bUpKZ4BAYN49TpEFMCI= github.com/tidwall/btree v1.6.0 h1:LDZfKfQIBHGHWSwckhXI0RPSXzlo+KYdjK7FWSqOzzg= @@ -690,8 +682,8 @@ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= -go.etcd.io/bbolt v1.3.6 h1:/ecaJf0sk1l4l6V4awd65v2C3ILy7MSj+s/x1ADCIMU= -go.etcd.io/bbolt v1.3.6/go.mod h1:qXsaaIqmgQH0T+OPdb99Bf+PKfBBQVAdyD6TY9G8XM4= +go.etcd.io/bbolt v1.3.7 h1:j+zJOnnEjF/kyHlDDgGnVL/AIqIJPq8UoB2GSNfkUfQ= +go.etcd.io/bbolt v1.3.7/go.mod h1:N9Mkw9X8x5fupy0IKsmuqVtoGDyxsaDlbk4Rd05IAQw= go.opencensus.io v0.18.0/go.mod h1:vKdFvxhtzZ9onBp9VKHK8z/sRpBMnKAsufL7wlDrCOA= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= @@ -887,7 +879,6 @@ golang.org/x/sys v0.0.0-20200602225109-6fdc65e7d980/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20200803210538-64077c9b5642/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200814200057-3d37ad5750ed/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200905004654-be1d3432aa8f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200923182605-d9f96fdee20d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys 
v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201201145000-ef89a241ccb3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/go/storage/mkvs/db/rocksdb/batch.go b/go/storage/mkvs/db/rocksdb/batch.go index 21485b8bc36..1faf734151c 100644 --- a/go/storage/mkvs/db/rocksdb/batch.go +++ b/go/storage/mkvs/db/rocksdb/batch.go @@ -1,3 +1,6 @@ +//go:build rocksdb +// +build rocksdb + package rocksdb import ( diff --git a/go/storage/mkvs/db/rocksdb/iterator.go b/go/storage/mkvs/db/rocksdb/iterator.go index 76e675bee42..663c5d5cff0 100644 --- a/go/storage/mkvs/db/rocksdb/iterator.go +++ b/go/storage/mkvs/db/rocksdb/iterator.go @@ -1,3 +1,6 @@ +//go:build rocksdb +// +build rocksdb + package rocksdb import ( diff --git a/go/storage/mkvs/db/rocksdb/metadata.go b/go/storage/mkvs/db/rocksdb/metadata.go index 9935d16c03b..2e11c6ffda8 100644 --- a/go/storage/mkvs/db/rocksdb/metadata.go +++ b/go/storage/mkvs/db/rocksdb/metadata.go @@ -1,3 +1,6 @@ +//go:build rocksdb +// +build rocksdb + package rocksdb import ( diff --git a/go/storage/mkvs/db/rocksdb/norocksdb.go b/go/storage/mkvs/db/rocksdb/norocksdb.go new file mode 100644 index 00000000000..086165b6db9 --- /dev/null +++ b/go/storage/mkvs/db/rocksdb/norocksdb.go @@ -0,0 +1,14 @@ +//go:build !rocksdb + +package rocksdb + +import ( + "fmt" + + "github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/api" +) + +// New creates a new RocksDB-backed node database. +func New(cfg *api.Config) (api.NodeDB, error) { + return nil, fmt.Errorf("mkvs/rocksdb: not compiled with RocksDB support") +} diff --git a/go/storage/mkvs/db/rocksdb/rocksdb.go b/go/storage/mkvs/db/rocksdb/rocksdb.go index 2d4d7138ea9..95900c3a9d9 100644 --- a/go/storage/mkvs/db/rocksdb/rocksdb.go +++ b/go/storage/mkvs/db/rocksdb/rocksdb.go @@ -1,3 +1,6 @@ +//go:build rocksdb +// +build rocksdb + // Package rocksdb provides a RocksDB-backed node database. package rocksdb diff --git a/go/storage/mkvs/db/rocksdb/rocksdb_test.go b/go/storage/mkvs/db/rocksdb/rocksdb_test.go index 3b355ed34a4..43ceeedcd4d 100644 --- a/go/storage/mkvs/db/rocksdb/rocksdb_test.go +++ b/go/storage/mkvs/db/rocksdb/rocksdb_test.go @@ -1,3 +1,6 @@ +//go:build rocksdb +// +build rocksdb + package rocksdb import ( diff --git a/go/storage/mkvs/db/rocksdb/timestamp.go b/go/storage/mkvs/db/rocksdb/timestamp.go index 14c1c2be7ba..73c1e72c4a3 100644 --- a/go/storage/mkvs/db/rocksdb/timestamp.go +++ b/go/storage/mkvs/db/rocksdb/timestamp.go @@ -1,3 +1,6 @@ +//go:build rocksdb +// +build rocksdb + package rocksdb import ( @@ -25,7 +28,7 @@ func createTimestampComparator() *grocksdb.Comparator { ) } -// Implements gorocksdb.Comparing. +// Implements grocksdb.Comparing. func compareTimestampKeys(a, b []byte) int { // First compare keys without timestamps. if ret := compareWithoutTimestamp(a, true, b, true); ret != 0 { @@ -36,7 +39,7 @@ func compareTimestampKeys(a, b []byte) int { return -compareTimestamp(a[len(a)-timestampSize:], b[len(b)-timestampSize:]) } -// Implements gorocksdb.Comparing. +// Implements grocksdb.Comparing. func compareTimestamp(a, b []byte) int { ts1 := binary.LittleEndian.Uint64(a) ts2 := binary.LittleEndian.Uint64(b) @@ -51,7 +54,7 @@ func compareTimestamp(a, b []byte) int { } } -// Implements gorocksdb.ComparingWithoutTimestamp. +// Implements grocksdb.ComparingWithoutTimestamp. 
func compareWithoutTimestamp(a []byte, aHasTs bool, b []byte, bHasTs bool) int { if aHasTs { a = a[:len(a)-timestampSize] @@ -80,7 +83,7 @@ func timestampFromVersion(version uint64) [timestampSize]byte { func versionFromTimestamp(ts *grocksdb.Slice) (uint64, error) { if !ts.Exists() { - return 0, fmt.Errorf("timestamp empty") + return 0, fmt.Errorf("missing timestamp") } defer ts.Free() return binary.LittleEndian.Uint64(ts.Data()), nil From bcaf6fa5b82af29297d4721d4598362e10e7dd1b Mon Sep 17 00:00:00 2001 From: ptrus Date: Thu, 2 Nov 2023 14:37:14 +0100 Subject: [PATCH 05/28] oasis-core-dev/Dockerfile: install rocksdb --- docker/oasis-core-dev/Dockerfile | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docker/oasis-core-dev/Dockerfile b/docker/oasis-core-dev/Dockerfile index 1cea4b1cb5a..a0d687f9762 100644 --- a/docker/oasis-core-dev/Dockerfile +++ b/docker/oasis-core-dev/Dockerfile @@ -13,6 +13,7 @@ ARG GOIMPORTS_VERSION=v0.12.0 ARG RUST_NIGHTLY_VERSION=2023-01-16 ARG JEMALLOC_VERSION=5.2.1 ARG JEMALLOC_CHECKSUM=34330e5ce276099e2e8950d9335db5a875689a4c6a56751ef3b1d8c537f887f6 +ARG ROCKSDB_VERSION=8.5.3 # Legacy package versions (upgrade tests). ARG LEGACY_GO_VERSION=1.20.2 @@ -81,6 +82,18 @@ RUN wget https://dl.google.com/go/go${GO_VERSION}.linux-amd64.tar.gz && \ go install mvdan.cc/gofumpt@${GOFUMPT_VERSION} && \ go install golang.org/x/tools/cmd/goimports@${GOIMPORTS_VERSION} +# Install RocksDB. +RUN \ + wget -q https://github.com/facebook/rocksdb/archive/v${ROCKSDB_VERSION}.tar.gz \ + # Ensure checksum matches. TODO + && tar -zxf v${ROCKSDB_VERSION}.tar.gz \ + && cd rocksdb-${ROCKSDB_VERSION} \ + && DEBUG_LEVEL=0 make -j4 shared_lib \ + && make install-shared \ + && ldconfig \ + && cd .. \ + && rm -rf v${ROCKSDB}.tar.gz rocksdb-${ROCKSDB} + # Install jemalloc (used by BadgerDB). RUN wget -O jemalloc.tar.bz2 \ https://github.com/jemalloc/jemalloc/releases/download/${JEMALLOC_VERSION}/jemalloc-${JEMALLOC_VERSION}.tar.bz2 && \ From e37d95df8677f1a9dff25626b11a6c5b8494deef Mon Sep 17 00:00:00 2001 From: ptrus Date: Thu, 2 Nov 2023 14:51:28 +0100 Subject: [PATCH 06/28] testing: use rocksdb everywhere --- .github/dependabot.yml | 2 ++ go/consensus/cometbft/abci/state.go | 1 + go/go.mod | 1 - go/oasis-node/cmd/debug/byzantine/storage_node.go | 4 ++-- go/oasis-test-runner/oasis/compute.go | 2 +- go/worker/storage/config/config.go | 4 ++-- go/worker/storage/crashing_test.go | 2 +- go/worker/storage/init.go | 3 +++ 8 files changed, 12 insertions(+), 7 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index f033db1a06e..489dc637500 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -30,6 +30,8 @@ updates: # CometBFT is manualy kept up to date. - dependency-name: github.com/cometbft/cometbft - dependency-name: github.com/cometbft/cometbft-db + # RocksDB is manualy kept up to date. + - dependency-name: github.com/linxGnu/grocksdb # Manage Rust pacakge versions. 
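The timestamp.go hunks above round out the comparator: every node key in the tree column families carries an 8-byte little-endian version suffix, keys order by user key first and then by version descending, and `versionFromTimestamp` decodes the suffix back out. A self-contained sketch of that key layout, standard library only; `appendTimestamp` and `compare` are illustrative stand-ins for the patch's `timestampFromVersion` and `compareTimestampKeys`:

```go
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

const timestampSize = 8

// appendTimestamp suffixes a key with its version, encoded like
// timestampFromVersion above: 8 bytes, little-endian.
func appendTimestamp(key []byte, version uint64) []byte {
	var ts [timestampSize]byte
	binary.LittleEndian.PutUint64(ts[:], version)
	return append(append([]byte{}, key...), ts[:]...)
}

// compare mirrors compareTimestampKeys: user key first, then version
// descending, so the newest write of a key sorts first.
func compare(a, b []byte) int {
	if c := bytes.Compare(a[:len(a)-timestampSize], b[:len(b)-timestampSize]); c != 0 {
		return c
	}
	av := binary.LittleEndian.Uint64(a[len(a)-timestampSize:])
	bv := binary.LittleEndian.Uint64(b[len(b)-timestampSize:])
	switch {
	case av > bv:
		return -1
	case av < bv:
		return 1
	default:
		return 0
	}
}

func main() {
	k5 := appendTimestamp([]byte("node/A"), 5)
	k7 := appendTimestamp([]byte("node/A"), 7)
	fmt.Println(compare(k7, k5)) // -1: version 7 of the same key sorts first.
}
```

A read stamped with version v (the timestamped read options used throughout the later patches) then behaves like a snapshot at v: the newest entry whose timestamp is at or below v is the visible one.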
- package-ecosystem: cargo diff --git a/go/consensus/cometbft/abci/state.go b/go/consensus/cometbft/abci/state.go index 88c993fe8a4..04ab8e5474d 100644 --- a/go/consensus/cometbft/abci/state.go +++ b/go/consensus/cometbft/abci/state.go @@ -638,6 +638,7 @@ func InitStateStorage(cfg *ApplicationConfig) (storage.LocalBackend, storage.Nod switch cfg.StorageBackend { case storageDB.BackendNameBadgerDB: + case storageDB.BackendNameRocksDB: default: return nil, nil, nil, fmt.Errorf("unsupported storage backend: %s", cfg.StorageBackend) } diff --git a/go/go.mod b/go/go.mod index 609bb7ab089..6c99a61c15f 100644 --- a/go/go.mod +++ b/go/go.mod @@ -9,7 +9,6 @@ replace ( golang.org/x/crypto/curve25519 => github.com/oasisprotocol/curve25519-voi/primitives/x25519 v0.0.0-20210505121811-294cf0fbfb43 golang.org/x/crypto/ed25519 => github.com/oasisprotocol/curve25519-voi/primitives/ed25519 v0.0.0-20210505121811-294cf0fbfb43 - ) require ( diff --git a/go/oasis-node/cmd/debug/byzantine/storage_node.go b/go/oasis-node/cmd/debug/byzantine/storage_node.go index 80c64093c4d..515ea7c7077 100644 --- a/go/oasis-node/cmd/debug/byzantine/storage_node.go +++ b/go/oasis-node/cmd/debug/byzantine/storage_node.go @@ -47,8 +47,8 @@ func newStorageNode(namespace common.Namespace, datadir string) (*storageWorker, defer close(initCh) cfg := &storage.Config{ - Backend: database.BackendNameBadgerDB, - DB: filepath.Join(datadir, database.DefaultFileName(database.BackendNameBadgerDB)), + Backend: database.BackendNameRocksDB, + DB: filepath.Join(datadir, database.DefaultFileName(database.BackendNameRocksDB)), Namespace: namespace, MaxCacheSize: 64 * 1024 * 1024, } diff --git a/go/oasis-test-runner/oasis/compute.go b/go/oasis-test-runner/oasis/compute.go index fc31603072e..a50de277b56 100644 --- a/go/oasis-test-runner/oasis/compute.go +++ b/go/oasis-test-runner/oasis/compute.go @@ -218,7 +218,7 @@ func (net *Network) NewCompute(cfg *ComputeCfg) (*Compute, error) { cfg.RuntimeProvisioner = runtimeConfig.RuntimeProvisionerSandboxed } if cfg.StorageBackend == "" { - cfg.StorageBackend = database.BackendNameBadgerDB + cfg.StorageBackend = database.BackendNameRocksDB } // Initialize runtime state paths. for i, path := range cfg.RuntimeStatePaths { diff --git a/go/worker/storage/config/config.go b/go/worker/storage/config/config.go index 46821c43f45..ea3ded24083 100644 --- a/go/worker/storage/config/config.go +++ b/go/worker/storage/config/config.go @@ -34,7 +34,7 @@ type CheckpointerConfig struct { // Validate validates the configuration settings. func (c *Config) Validate() error { - if c.Backend != "badger" { + if c.Backend != "badger" && c.Backend != "rocksdb" { return fmt.Errorf("unknown storage backend: %s", c.Backend) } @@ -44,7 +44,7 @@ func (c *Config) Validate() error { // DefaultConfig returns the default configuration settings. 
func DefaultConfig() Config { return Config{ - Backend: "badger", + Backend: "rocksdb", MaxCacheSize: "64mb", FetcherCount: 4, PublicRPCEnabled: false, diff --git a/go/worker/storage/crashing_test.go b/go/worker/storage/crashing_test.go index e0fdb8fcd46..d5389020a68 100644 --- a/go/worker/storage/crashing_test.go +++ b/go/worker/storage/crashing_test.go @@ -21,7 +21,7 @@ func TestCrashingBackendDoNotInterfere(t *testing.T) { var ( cfg = api.Config{ - Backend: database.BackendNameBadgerDB, + Backend: database.BackendNameRocksDB, Namespace: testNs, MaxCacheSize: 16 * 1024 * 1024, } diff --git a/go/worker/storage/init.go b/go/worker/storage/init.go index 6678f5e436c..be46d401592 100644 --- a/go/worker/storage/init.go +++ b/go/worker/storage/init.go @@ -47,6 +47,9 @@ func NewLocalBackend( case database.BackendNameBadgerDB: cfg.DB = GetLocalBackendDBDir(dataDir, cfg.Backend) impl, err = database.New(cfg) + case database.BackendNameRocksDB: + cfg.DB = GetLocalBackendDBDir(dataDir, cfg.Backend) + impl, err = database.New(cfg) default: err = fmt.Errorf("storage: unsupported backend: '%v'", cfg.Backend) } From c279ddccaa27a42364fd8b6eaae40b6c4244df1f Mon Sep 17 00:00:00 2001 From: ptrus Date: Thu, 2 Nov 2023 17:23:52 +0100 Subject: [PATCH 07/28] test build fixes --- .goreleaser.yml | 12 ++++++++---- go/Makefile | 1 + go/storage/mkvs/db/rocksdb/norocksdb.go | 2 +- go/storage/mkvs/db/rocksdb/rocksdb_test.go | 2 -- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.goreleaser.yml b/.goreleaser.yml index 07e7d15a47d..bfd6adfeb75 100644 --- a/.goreleaser.yml +++ b/.goreleaser.yml @@ -18,10 +18,14 @@ builds: dir: go/ flags: - -trimpath - # Build oasis-node with jemalloc tag (used by badgerdb). - # TODO: Use 'tags' attribute when GoReleaser is udpated to newer version: - # https://github.com/goreleaser/goreleaser/pull/2268 - - -tags=jemalloc + tags: + # Build BadgerDB with jemalloc. + - jemalloc + # Build RocksDB storage backend. + - rocksdb + - grocksdb_clean_link + env: + - CGO_ENABLED=1 ldflags: # NOTE: At the moment, GoReleaser produces different binaries when # releases are built from different git paths, unless -buildid= is added diff --git a/go/Makefile b/go/Makefile index afc65563d3e..8f83cd30bc1 100644 --- a/go/Makefile +++ b/go/Makefile @@ -27,6 +27,7 @@ endif # If GO_TAGS is not empty, append it to GO_EXTRA_FLAGS. ifneq ($(GO_TAGS),) GO_EXTRA_FLAGS += -tags "$(GO_TAGS)" + GO_TEST_FLAGS += -tags "$(GO_TAGS)" endif # Set all target as the default target. diff --git a/go/storage/mkvs/db/rocksdb/norocksdb.go b/go/storage/mkvs/db/rocksdb/norocksdb.go index 086165b6db9..24c2c90bcb3 100644 --- a/go/storage/mkvs/db/rocksdb/norocksdb.go +++ b/go/storage/mkvs/db/rocksdb/norocksdb.go @@ -9,6 +9,6 @@ import ( ) // New creates a new RocksDB-backed node database. 
-func New(cfg *api.Config) (api.NodeDB, error) { +func New(_ *api.Config) (api.NodeDB, error) { return nil, fmt.Errorf("mkvs/rocksdb: not compiled with RocksDB support") } diff --git a/go/storage/mkvs/db/rocksdb/rocksdb_test.go b/go/storage/mkvs/db/rocksdb/rocksdb_test.go index 43ceeedcd4d..a14e806b928 100644 --- a/go/storage/mkvs/db/rocksdb/rocksdb_test.go +++ b/go/storage/mkvs/db/rocksdb/rocksdb_test.go @@ -134,7 +134,6 @@ func verifyNodes(require *require.Assertions, rocksdb *rocksdbNodeDB, version ui } checkNodes := func(cf *grocksdb.ColumnFamilyHandle) { - fmt.Println("checking nodes") it := prefixIterator(rocksdb.db.NewIteratorCF(timestampReadOptions(version), cf), nil) defer it.Close() for ; it.Valid(); it.Next() { @@ -148,7 +147,6 @@ func verifyNodes(require *require.Assertions, rocksdb *rocksdbNodeDB, version ui delete(notVisited, string(key)) } } - fmt.Println("Verify nodes.....") checkNodes(rocksdb.cfIOTree) checkNodes(rocksdb.cfStateTree) From 7084d830986707f7f99dfb9a3f6a65bd84449b01 Mon Sep 17 00:00:00 2001 From: ptrus Date: Thu, 2 Nov 2023 18:05:28 +0100 Subject: [PATCH 08/28] storage/interop/protocol: use rocksdb backend --- .buildkite/go/test_and_coverage.sh | 9 +++++---- go/storage/mkvs/db/rocksdb/rocksdb_test.go | 1 - go/storage/mkvs/interop/cmd/protocol_server.go | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.buildkite/go/test_and_coverage.sh b/.buildkite/go/test_and_coverage.sh index 50986a58d55..846eea2ab91 100755 --- a/.buildkite/go/test_and_coverage.sh +++ b/.buildkite/go/test_and_coverage.sh @@ -20,26 +20,27 @@ download_artifact simple-keyvalue target/debug 755 export OASIS_TEST_RUNTIME_HOST_RUNTIME_PATH=$(pwd)/target/debug/simple-keyvalue +TAGS="jemalloc,rocksdb,grocksdb_clean_link" ##################### # Test the Oasis node ##################### pushd go make generate # We need to do multiple test passes for different parts to get correct coverage. - env -u GOPATH go test -race -coverprofile=../coverage-misc.txt -covermode=atomic -v \ + env -u GOPATH go test -tags "${TAGS}" -race -coverprofile=../coverage-misc.txt -covermode=atomic -v \ $(go list ./... | \ grep -v github.com/oasisprotocol/oasis-core/go/oasis-node | \ grep -v github.com/oasisprotocol/oasis-core/go/genesis | \ grep -v github.com/oasisprotocol/oasis-core/go/storage/mkvs ) # Oasis node tests. pushd oasis-node - env -u GOPATH go test -race -coverpkg ../... -coverprofile=../../coverage-oasis-node.txt -covermode=atomic -v ./... + env -u GOPATH go test -tags "${TAGS}" -race -coverpkg ../... -coverprofile=../../coverage-oasis-node.txt -covermode=atomic -v ./... popd pushd genesis - env -u GOPATH go test -race -coverpkg ../... -coverprofile=../../coverage-genesis.txt -covermode=atomic -v ./... + env -u GOPATH go test -tags "${TAGS}" -race -coverpkg ../... -coverprofile=../../coverage-genesis.txt -covermode=atomic -v ./... popd # MKVS tests. pushd storage/mkvs - env -u GOPATH go test -race -coverpkg ./... -coverprofile=../../../coverage-mkvs.txt -covermode=atomic -v ./... + env -u GOPATH go test -tags "${TAGS}" -race -coverpkg ./... -coverprofile=../../../coverage-mkvs.txt -covermode=atomic -v ./... 
	popd
popd

diff --git a/go/storage/mkvs/db/rocksdb/rocksdb_test.go b/go/storage/mkvs/db/rocksdb/rocksdb_test.go
index a14e806b928..33785d07011 100644
--- a/go/storage/mkvs/db/rocksdb/rocksdb_test.go
+++ b/go/storage/mkvs/db/rocksdb/rocksdb_test.go
@@ -142,7 +142,6 @@ func verifyNodes(require *require.Assertions, rocksdb *rocksdbNodeDB, version ui
 				continue
 			}
 			_, ok := keySet[string(key)]
-			fmt.Println(key)
 			require.Equal(true, ok, "unexpected node in db")
 			delete(notVisited, string(key))
 		}
diff --git a/go/storage/mkvs/interop/cmd/protocol_server.go b/go/storage/mkvs/interop/cmd/protocol_server.go
index 161cbda9197..33017173dc6 100644
--- a/go/storage/mkvs/interop/cmd/protocol_server.go
+++ b/go/storage/mkvs/interop/cmd/protocol_server.go
@@ -17,7 +17,7 @@ import (
 	"github.com/oasisprotocol/oasis-core/go/oasis-node/cmd/common/background"
 	"github.com/oasisprotocol/oasis-core/go/storage/api"
 	"github.com/oasisprotocol/oasis-core/go/storage/database"
-	badgerNodedb "github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/badger"
+	"github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/rocksdb"
 	"github.com/oasisprotocol/oasis-core/go/storage/mkvs/interop/fixtures"
 	"github.com/oasisprotocol/oasis-core/go/storage/mkvs/node"
 )
@@ -68,7 +68,7 @@ func doProtoServer(*cobra.Command, []string) {
 
 	// Initialize a dummy storage backend.
 	storageCfg := api.Config{
-		Backend:      database.BackendNameBadgerDB,
+		Backend:      database.BackendNameRocksDB,
 		DB:           dataDir,
 		MaxCacheSize: 16 * 1024 * 1024,
 	}
@@ -77,7 +77,7 @@ func doProtoServer(*cobra.Command, []string) {
 	ctx := context.Background()
 	ndbCfg := storageCfg.ToNodeDB()
 	var ndb api.NodeDB
-	ndb, err = badgerNodedb.New(ndbCfg)
+	ndb, err = rocksdb.New(ndbCfg)
 	if err != nil {
 		logger.Error("failed to initialize node db",
 			"err", err,

From 9d76990aefe57bc300925a7bf2c7071a1708bb39 Mon Sep 17 00:00:00 2001
From: ptrus
Date: Thu, 2 Nov 2023 19:48:45 +0100
Subject: [PATCH 09/28] Makefile: use all build tags in e2e coverage mode

---
 go/Makefile | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/go/Makefile b/go/Makefile
index 8f83cd30bc1..51786fe8432 100644
--- a/go/Makefile
+++ b/go/Makefile
@@ -24,6 +24,14 @@ ifneq ($(OASIS_NO_ROCKSDB), 1)
 	GO_TAGS := $(GO_TAGS)rocksdb,grocksdb_clean_link
 endif
 
+# Include e2ecoverage tag if configured.
+ifeq ($(GO_BUILD_E2E_COVERAGE),1)
+	ifneq ($(GO_TAGS),)
+		GO_TAGS := $(GO_TAGS),
+	endif
+	GO_TAGS := $(GO_TAGS)e2ecoverage
+endif
+
 # If GO_TAGS is not empty, append it to GO_EXTRA_FLAGS.
 ifneq ($(GO_TAGS),)
 	GO_EXTRA_FLAGS += -tags "$(GO_TAGS)"
@@ -49,7 +57,7 @@ $(go-binaries):
 	@$(GO) build $(GOFLAGS) $(GO_EXTRA_FLAGS) -o ./$@/$(notdir $@) ./$@
 ifeq ($(GO_BUILD_E2E_COVERAGE),1)
 	@$(ECHO) "$(MAGENTA)*** Building $@ with E2E coverage...$(OFF)"
-	@$(GO) test $(GOFLAGS) -c -tags e2ecoverage -covermode=atomic -coverpkg=./... -o ./$@/$(notdir $@).test ./$@
+	@$(GO) test $(GOFLAGS) -c $(GO_TEST_FLAGS) -covermode=atomic -coverpkg=./... -o ./$@/$(notdir $@).test ./$@
 endif
 
 build: $(go-binaries)
@@ -113,7 +121,7 @@ test: $(test-targets)
 # Test without caching.
 force-test:
 	@$(ECHO) "$(CYAN)*** Running Go unit tests in force mode...$(OFF)"
-	@$(MAKE) test GO_TEST_FLAGS=-count=1
+	@$(MAKE) test GO_TEST_FLAGS="$(GO_TEST_FLAGS) -count=1"
 
 # Clean.
clean: From c11839aa276ac55688bfb5600a365b9c3e53631f Mon Sep 17 00:00:00 2001 From: ptrus Date: Thu, 2 Nov 2023 20:05:10 +0100 Subject: [PATCH 10/28] docker: fix build --- docker/oasis-core-dev/Dockerfile | 31 ++++++++++++++++++------------- go/Makefile | 1 + 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/docker/oasis-core-dev/Dockerfile b/docker/oasis-core-dev/Dockerfile index a0d687f9762..faa9066bcfe 100644 --- a/docker/oasis-core-dev/Dockerfile +++ b/docker/oasis-core-dev/Dockerfile @@ -14,6 +14,7 @@ ARG RUST_NIGHTLY_VERSION=2023-01-16 ARG JEMALLOC_VERSION=5.2.1 ARG JEMALLOC_CHECKSUM=34330e5ce276099e2e8950d9335db5a875689a4c6a56751ef3b1d8c537f887f6 ARG ROCKSDB_VERSION=8.5.3 +ARG ROCKSDB_CHECKSUM=ed4230500b9ca20bc7918c32166b2d0d46a8695c59991821daa586d55689d785 # Legacy package versions (upgrade tests). ARG LEGACY_GO_VERSION=1.20.2 @@ -33,7 +34,9 @@ RUN apt-get update -qq && apt-get upgrade -qq && apt-get install -qq \ python3-prometheus-client \ # for seccomp Go bindings support libseccomp-dev \ - bubblewrap && \ + bubblewrap \ + # Compression libs for RocksDB. + libsnappy-dev libbz2-dev liblz4-dev libzstd-dev && \ apt-get autoclean && apt-get autoremove && rm -rf /var/cache/apt/archives/* && \ # for linting Git commits pip install gitlint @@ -82,18 +85,6 @@ RUN wget https://dl.google.com/go/go${GO_VERSION}.linux-amd64.tar.gz && \ go install mvdan.cc/gofumpt@${GOFUMPT_VERSION} && \ go install golang.org/x/tools/cmd/goimports@${GOIMPORTS_VERSION} -# Install RocksDB. -RUN \ - wget -q https://github.com/facebook/rocksdb/archive/v${ROCKSDB_VERSION}.tar.gz \ - # Ensure checksum matches. TODO - && tar -zxf v${ROCKSDB_VERSION}.tar.gz \ - && cd rocksdb-${ROCKSDB_VERSION} \ - && DEBUG_LEVEL=0 make -j4 shared_lib \ - && make install-shared \ - && ldconfig \ - && cd .. \ - && rm -rf v${ROCKSDB}.tar.gz rocksdb-${ROCKSDB} - # Install jemalloc (used by BadgerDB). RUN wget -O jemalloc.tar.bz2 \ https://github.com/jemalloc/jemalloc/releases/download/${JEMALLOC_VERSION}/jemalloc-${JEMALLOC_VERSION}.tar.bz2 && \ @@ -108,3 +99,17 @@ RUN wget -O jemalloc.tar.bz2 \ make && \ make install && \ cd .. && rm jemalloc.tar.bz2 && rm -rf jemalloc-${JEMALLOC_VERSION} + +# Install RocksDB. +RUN wget -q https://github.com/facebook/rocksdb/archive/v${ROCKSDB_VERSION}.tar.gz && \ + # Ensure checksum matches. + echo "${ROCKSDB_CHECKSUM} v${ROCKSDB_VERSION}.tar.gz" | sha256sum -c && \ + tar -zxf v${ROCKSDB_VERSION}.tar.gz && \ + cd rocksdb-${ROCKSDB_VERSION} && \ + # TODO: clashes with jemalloc used by BadgerDB. + # For 64-bit x86 the `PORTABLE=haswell` is a reasonable compromise, which supports many or most + # of the available optimizations while still being compatible with most processors made since + # roughly 2013. https://github.com/facebook/rocksdb/blob/main/INSTALL.md + DEBUG_LEVEL=0 ROCKSDB_DISABLE_JEMALLOC=1 PORTABLE=haswell make -j4 shared_lib && \ + make install-shared && ldconfig && \ + cd .. && rm -rf v${ROCKSDB_VERSION}.tar.gz rocksdb-${ROCKSDB_VERSION} diff --git a/go/Makefile b/go/Makefile index 51786fe8432..6dacde631fb 100644 --- a/go/Makefile +++ b/go/Makefile @@ -6,6 +6,7 @@ ifneq ($(GOLDFLAGS),) endif # Initialize GO_TAGS variable to hold the build tags. +# TODO: actually only oasis-node needs to be built with these, other binaries not. GO_TAGS := # Include jemalloc tag unless explicitly disabled (used by badgerdb). 
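The GO_TAGS plumbing visible in the Makefile hunks above drives which files the Go toolchain compiles: the rocksdb package keeps two variants of `New` behind opposite build constraints, so exactly one is linked into any given build. A minimal sketch of that two-file pattern, using a placeholder `somefeature` tag (the real tag is `rocksdb`, with the `norocksdb.go` stub shown in an earlier patch):

```go
// enabled.go - compiled only with: go build -tags somefeature
//go:build somefeature

package feature

// New returns the real implementation when the tag is present.
func New() (string, error) { return "real backend", nil }
```

Its counterpart carries the negated constraint, so callers can import the package unconditionally and get a clean runtime error, rather than a link failure, when the feature is compiled out:

```go
// disabled.go - the stub linked into every build without the tag.
//go:build !somefeature

package feature

import "errors"

// New mirrors norocksdb.go above: same signature, graceful failure.
func New() (string, error) { return "", errors.New("feature: not compiled in") }
```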
From c808f3a3fdb51df7beb39bf359360cf6acd5d717 Mon Sep 17 00:00:00 2001 From: ptrus Date: Fri, 3 Nov 2023 14:43:54 +0100 Subject: [PATCH 11/28] debug/{dumpdb,storage}: support rocksdb backend --- go/oasis-node/cmd/debug/dumpdb/dumpdb.go | 13 +++++++++++-- go/oasis-node/cmd/debug/storage/export.go | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/go/oasis-node/cmd/debug/dumpdb/dumpdb.go b/go/oasis-node/cmd/debug/dumpdb/dumpdb.go index 61b2e487af8..01c913d6248 100644 --- a/go/oasis-node/cmd/debug/dumpdb/dumpdb.go +++ b/go/oasis-node/cmd/debug/dumpdb/dumpdb.go @@ -6,6 +6,7 @@ import ( "fmt" "os" "path/filepath" + "strings" "time" "github.com/spf13/cobra" @@ -14,6 +15,7 @@ import ( beacon "github.com/oasisprotocol/oasis-core/go/beacon/api" "github.com/oasisprotocol/oasis-core/go/common/logging" + "github.com/oasisprotocol/oasis-core/go/config" "github.com/oasisprotocol/oasis-core/go/consensus/cometbft/abci" abciState "github.com/oasisprotocol/oasis-core/go/consensus/cometbft/abci/state" cmtAPI "github.com/oasisprotocol/oasis-core/go/consensus/cometbft/api" @@ -37,7 +39,7 @@ import ( scheduler "github.com/oasisprotocol/oasis-core/go/scheduler/api" staking "github.com/oasisprotocol/oasis-core/go/staking/api" storage "github.com/oasisprotocol/oasis-core/go/storage/api" - storageDB "github.com/oasisprotocol/oasis-core/go/storage/database" + storageDatabase "github.com/oasisprotocol/oasis-core/go/storage/database" "github.com/oasisprotocol/oasis-core/go/storage/mkvs/checkpoint" ) @@ -101,11 +103,18 @@ func doDumpDB(cmd *cobra.Command, _ []string) { // read-only mode because it needs to truncate the value log. // // Hope you have backups if you ever run into this. + b := strings.ToLower(config.GlobalConfig.Storage.Backend) + switch b { + case storageDatabase.BackendNameBadgerDB, storageDatabase.BackendNameRocksDB: + default: + logger.Error("unsupported storage backend", "backend", b) + return + } ctx := context.Background() ldb, _, stateRoot, err := abci.InitStateStorage( &abci.ApplicationConfig{ DataDir: filepath.Join(dataDir, cmtCommon.StateDir), - StorageBackend: storageDB.BackendNameBadgerDB, // No other backend for now. 
+			StorageBackend:      b,
 			MemoryOnlyStorage:   false,
 			ReadOnlyStorage:     viper.GetBool(cfgDumpReadOnlyDB),
 			DisableCheckpointer: true,
diff --git a/go/oasis-node/cmd/debug/storage/export.go b/go/oasis-node/cmd/debug/storage/export.go
index 8ad848cbd04..d6394f7f8dd 100644
--- a/go/oasis-node/cmd/debug/storage/export.go
+++ b/go/oasis-node/cmd/debug/storage/export.go
@@ -167,7 +167,7 @@ func newDirectStorageBackend(dataDir string, namespace common.Namespace) (storag
 	}
 
 	switch b {
-	case storageDatabase.BackendNameBadgerDB:
+	case storageDatabase.BackendNameBadgerDB, storageDatabase.BackendNameRocksDB:
 		cfg.DB = filepath.Join(cfg.DB, storageDatabase.DefaultFileName(cfg.Backend))
 		return storageDatabase.New(cfg)
 	default:

From 15bb10d4bed79b400fb87a0766443ca1bc07854c Mon Sep 17 00:00:00 2001
From: ptrus
Date: Sat, 4 Nov 2023 15:35:53 +0100
Subject: [PATCH 12/28] Don't hardcode badger in abci.ApplicationConfig

---
 go/consensus/cometbft/db/init.go      | 5 -----
 go/consensus/cometbft/full/archive.go | 2 +-
 go/consensus/cometbft/full/full.go    | 2 +-
 go/oasis-node/node_test.go            | 2 +-
 go/worker/storage/init.go             | 5 +----
 5 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/go/consensus/cometbft/db/init.go b/go/consensus/cometbft/db/init.go
index 167c7dcfb0a..0fffd6b57ad 100644
--- a/go/consensus/cometbft/db/init.go
+++ b/go/consensus/cometbft/db/init.go
@@ -8,11 +8,6 @@ import (
 	"github.com/oasisprotocol/oasis-core/go/consensus/cometbft/db/badger"
 )
 
-// GetBackendName returns the currently configured CometBFT database backend.
-func GetBackendName() string {
-	return badger.BackendName
-}
-
 // GetProvider returns the currently configured CometBFT DBProvider.
 func GetProvider() (node.DBProvider, error) {
 	return badger.DBProvider, nil
diff --git a/go/consensus/cometbft/full/archive.go b/go/consensus/cometbft/full/archive.go
index 44eaf47793b..f3eb97eb2f0 100644
--- a/go/consensus/cometbft/full/archive.go
+++ b/go/consensus/cometbft/full/archive.go
@@ -160,7 +160,7 @@ func NewArchive(
 
 	appConfig := &abci.ApplicationConfig{
 		DataDir:        filepath.Join(srv.dataDir, tmcommon.StateDir),
-		StorageBackend: db.GetBackendName(),
+		StorageBackend: config.GlobalConfig.Storage.Backend,
 		Pruning: abci.PruneConfig{
 			Strategy:      abci.PruneNone,
 			PruneInterval: time.Hour * 100, // Irrelevant as pruning is disabled.
diff --git a/go/consensus/cometbft/full/full.go b/go/consensus/cometbft/full/full.go
index ef3596053ff..0286555f7fb 100644
--- a/go/consensus/cometbft/full/full.go
+++ b/go/consensus/cometbft/full/full.go
@@ -533,7 +533,7 @@ func (t *fullService) lazyInit() error { // nolint: gocyclo
 
 	appConfig := &abci.ApplicationConfig{
 		DataDir:        filepath.Join(t.dataDir, tmcommon.StateDir),
-		StorageBackend: db.GetBackendName(),
+		StorageBackend: config.GlobalConfig.Storage.Backend,
 		Pruning:        pruneCfg,
 		HaltEpoch:      beaconAPI.EpochTime(config.GlobalConfig.Consensus.HaltEpoch),
 		HaltHeight:     config.GlobalConfig.Consensus.HaltHeight,
diff --git a/go/oasis-node/node_test.go b/go/oasis-node/node_test.go
index aa625f9821b..fb1fc0bdb56 100644
--- a/go/oasis-node/node_test.go
+++ b/go/oasis-node/node_test.go
@@ -141,7 +141,7 @@ func newTestNode(t *testing.T) *testNode {
 	config.GlobalConfig.Common.Debug.AllowRoot = true
 	config.GlobalConfig.Mode = config.ModeCompute
 	config.GlobalConfig.Runtime.Provisioner = runtimeConfig.RuntimeProvisionerMock
-	config.GlobalConfig.Storage.Backend = "badger"
+	config.GlobalConfig.Storage.Backend = "rocksdb"
 	config.GlobalConfig.Storage.PublicRPCEnabled = true
 	config.GlobalConfig.Consensus.ListenAddress = "tcp://0.0.0.0:27565"
 	config.GlobalConfig.Consensus.SupplementarySanity.Enabled = true
diff --git a/go/worker/storage/init.go b/go/worker/storage/init.go
index be46d401592..6def3877938 100644
--- a/go/worker/storage/init.go
+++ b/go/worker/storage/init.go
@@ -44,10 +44,7 @@ func NewLocalBackend(
 		impl api.LocalBackend
 	)
 	switch cfg.Backend {
-	case database.BackendNameBadgerDB:
-		cfg.DB = GetLocalBackendDBDir(dataDir, cfg.Backend)
-		impl, err = database.New(cfg)
-	case database.BackendNameRocksDB:
+	case database.BackendNameBadgerDB, database.BackendNameRocksDB:
 		cfg.DB = GetLocalBackendDBDir(dataDir, cfg.Backend)
 		impl, err = database.New(cfg)
 	default:

From 09107e8d6291d6082f4ef48ff4d9f3df61974f02 Mon Sep 17 00:00:00 2001
From: ptrus
Date: Tue, 7 Nov 2023 09:20:29 +0100
Subject: [PATCH 13/28] docs: initial rocksdb build documentation

---
 docs/development-setup/prerequisites.md | 28 +++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/docs/development-setup/prerequisites.md b/docs/development-setup/prerequisites.md
index 323bf75ddf5..4b39b0efb1f 100644
--- a/docs/development-setup/prerequisites.md
+++ b/docs/development-setup/prerequisites.md
@@ -210,6 +210,34 @@ Core:
   (i.e. you can't use `./configure --prefix=$HOME/.local ...`)
   because upstream authors [hardcode its path][jemalloc-hardcode-path]._
 
+* (**OPTIONAL**) [rocksdb] (version 8.5.3)
+
+  # TODO: investigate clashing with jemalloc built above.
+  Alternatively, set the `OASIS_NO_ROCKSDB="1"` environment variable when
+  building `oasis-node` to build it without `rocksdb` support.
+
+  See official instructions on building `rocksdb` for your system: https://github.com/facebook/rocksdb/blob/main/INSTALL.md
+
+  Or use the following to build (non-portable) `rocksdb` on Ubuntu 22.04:
+  ```
+  # Install prerequisites.
+  apt install libgflags-dev libsnappy-dev libbz2-dev liblz4-dev libzstd-dev
+  # Build RocksDB.
+  ROCKSDB_VERSION=8.5.3
+  ROCKSDB_CHECKSUM=ed4230500b9ca20bc7918c32166b2d0d46a8695c59991821daa586d55689d785
+  pushd $(mktemp -d)
+  wget -O rocksdb.tar.gz \
+    https://github.com/facebook/rocksdb/archive/v${ROCKSDB_VERSION}.tar.gz
+  # Ensure checksum matches.
+ echo "${ROCKSDB_CHECKSUM} rocksdb.tar.gz" | sha256sum -c + tar -zxf rocksdb.tar.gz + cd rocksdb-${ROCKSDB_VERSION} + DEBUG_LEVEL=0 ROCKSDB_DISABLE_JEMALLOC=1 make -j4 shared_lib + sudo make install-shared + sudo ldconfig + popd + ``` + In the following instructions, the top-level directory is the directory where the code has been checked out. From b29d88fb6a36bc5d3f648fa7b81552328eaf3ea1 Mon Sep 17 00:00:00 2001 From: ptrus Date: Tue, 7 Nov 2023 09:21:13 +0100 Subject: [PATCH 14/28] storage/rocksdb: ensure WriteBatch is destroyed --- go/storage/mkvs/db/rocksdb/batch.go | 4 ++++ go/storage/mkvs/db/rocksdb/rocksdb.go | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/go/storage/mkvs/db/rocksdb/batch.go b/go/storage/mkvs/db/rocksdb/batch.go index 1faf734151c..88fc46490cc 100644 --- a/go/storage/mkvs/db/rocksdb/batch.go +++ b/go/storage/mkvs/db/rocksdb/batch.go @@ -37,6 +37,10 @@ type rocksdbBatch struct { // Commit implements api.Batch. func (ba *rocksdbBatch) Commit(root node.Root) error { + defer ba.bat.Destroy() + if ba.multipartNodes != nil { + defer ba.multipartNodes.Destroy() + } ba.db.metaUpdateLock.Lock() defer ba.db.metaUpdateLock.Unlock() diff --git a/go/storage/mkvs/db/rocksdb/rocksdb.go b/go/storage/mkvs/db/rocksdb/rocksdb.go index 95900c3a9d9..34a434af5fa 100644 --- a/go/storage/mkvs/db/rocksdb/rocksdb.go +++ b/go/storage/mkvs/db/rocksdb/rocksdb.go @@ -463,10 +463,13 @@ func (d *rocksdbNodeDB) GetWriteLog(ctx context.Context, startRoot, endRoot node } item, err := d.db.GetCF(timestampReadOptions(endRoot.Version), cf, key) - if err != nil || !item.Exists() { + if err != nil { return node.Root{}, nil, err } defer item.Free() + if !item.Exists() { + return node.Root{}, nil, err + } var log api.HashedDBWriteLog if err := cbor.UnmarshalTrusted(item.Data(), &log); err != nil { @@ -749,6 +752,7 @@ func (d *rocksdbNodeDB) Prune(ctx context.Context, version uint64) error { } batch := grocksdb.NewWriteBatch() + defer batch.Destroy() for rootHash, derivedRoots := range rootsMeta.Roots { if len(derivedRoots) > 0 { // Not a lone root. 
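The defers added in the patch above follow from grocksdb being a cgo binding: write batches, options, and result slices wrap C allocations that the Go garbage collector never sees, so each one needs an explicit `Destroy`/`Free` on every return path. A sketch of the discipline; the function names are illustrative, the grocksdb calls are the real ones used above:

```go
package sketch

import (
	"fmt"

	"github.com/linxGnu/grocksdb"
)

// readValue releases every C-backed object with defer, immediately after
// acquiring it, so early returns cannot leak.
func readValue(db *grocksdb.DB, key []byte) ([]byte, error) {
	ro := grocksdb.NewDefaultReadOptions()
	defer ro.Destroy() // C allocation; Go's GC will not reclaim it.

	s, err := db.Get(ro, key)
	if err != nil {
		return nil, err
	}
	defer s.Free() // Deferring before the Exists check is safe, as in GetWriteLog above.

	if !s.Exists() {
		return nil, fmt.Errorf("sketch: not found")
	}
	// s.Data() points into C memory that s.Free releases, so copy it out.
	return append([]byte{}, s.Data()...), nil
}

// writeBatch mirrors the defer added to rocksdbBatch.Commit above.
func writeBatch(db *grocksdb.DB, wo *grocksdb.WriteOptions) error {
	b := grocksdb.NewWriteBatch()
	defer b.Destroy()

	b.Put([]byte("k1"), []byte("v1"))
	b.Put([]byte("k2"), []byte("v2"))
	return db.Write(wo, b)
}
```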
From 2d0787fca1e171faf83ddfa19134b43d9066c7ef Mon Sep 17 00:00:00 2001 From: ptrus Date: Tue, 7 Nov 2023 14:33:11 +0100 Subject: [PATCH 15/28] rocksdb: ensure read options are destroyed --- go/storage/mkvs/db/rocksdb/batch.go | 19 ++++-- go/storage/mkvs/db/rocksdb/rocksdb.go | 74 +++++++++++++--------- go/storage/mkvs/db/rocksdb/rocksdb_test.go | 36 ++++++----- go/storage/mkvs/db/rocksdb/timestamp.go | 8 +++ 4 files changed, 85 insertions(+), 52 deletions(-) diff --git a/go/storage/mkvs/db/rocksdb/batch.go b/go/storage/mkvs/db/rocksdb/batch.go index 88fc46490cc..dbe4c298319 100644 --- a/go/storage/mkvs/db/rocksdb/batch.go +++ b/go/storage/mkvs/db/rocksdb/batch.go @@ -224,15 +224,20 @@ func (s *rocksdbSubtree) PutNode(_ node.Depth, ptr *node.Pointer) error { s.batch.updatedNodes = append(s.batch.updatedNodes, updatedNode{Hash: h}) nodeKey := nodeKeyFmt.Encode(&h) if s.batch.multipartNodes != nil { - item, err := s.batch.db.db.GetCF(timestampReadOptions(s.batch.version), cf, nodeKey) - if err != nil { + if err = withTimestampRead(s.batch.version, func(readOpts *grocksdb.ReadOptions) error { + item, err := s.batch.db.db.GetCF(readOpts, cf, nodeKey) + if err != nil { + return err + } + defer item.Free() + if !item.Exists() { + th := node.TypedHashFromParts(s.batch.rootType, h) + s.batch.multipartNodes.Put(multipartRestoreNodeLogKeyFmt.Encode(&th), []byte{}) + } + return nil + }); err != nil { return err } - defer item.Free() - if !item.Exists() { - th := node.TypedHashFromParts(s.batch.rootType, h) - s.batch.multipartNodes.Put(multipartRestoreNodeLogKeyFmt.Encode(&th), []byte{}) - } } ts := timestampFromVersion(s.batch.version) diff --git a/go/storage/mkvs/db/rocksdb/rocksdb.go b/go/storage/mkvs/db/rocksdb/rocksdb.go index 34a434af5fa..a1d40838cf7 100644 --- a/go/storage/mkvs/db/rocksdb/rocksdb.go +++ b/go/storage/mkvs/db/rocksdb/rocksdb.go @@ -310,18 +310,20 @@ func (d *rocksdbNodeDB) checkRoot(root node.Root) error { rootHash := node.TypedHashFromRoot(root) cf := d.getColumnFamilyForRoot(root) - s, err := d.db.GetCF(timestampReadOptions(root.Version), cf, rootNodeKeyFmt.Encode(&rootHash)) - if err != nil { - d.logger.Error("failed to check root existence", - "err", err, - ) - return fmt.Errorf("mkvs/rocksdb: failed to get root from backing store: %w", err) - } - defer s.Free() - if !s.Exists() { - return api.ErrRootNotFound - } - return nil + return withTimestampRead(root.Version, func(readOpts *grocksdb.ReadOptions) error { + s, err := d.db.GetCF(readOpts, cf, rootNodeKeyFmt.Encode(&rootHash)) + if err != nil { + d.logger.Error("failed to check root existence", + "err", err, + ) + return fmt.Errorf("mkvs/rocksdb: failed to get root from backing store: %w", err) + } + defer s.Free() + if !s.Exists() { + return api.ErrRootNotFound + } + return nil + }) } // Implements api.NodeDB. 
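The patch above goes one step further and scopes the read options to a closure (`withTimestampRead`, added in the timestamp.go hunk later in this patch), so they cannot outlive the call at all. A sketch of the shape, with a simplified stand-in for the helper; the real one also stamps the options with the encoded version before handing them out:

```go
package sketch

import (
	"fmt"

	"github.com/linxGnu/grocksdb"
)

// withRead is a simplified stand-in for withTimestampRead: the options are
// created, lent to the closure, and destroyed on the way out.
func withRead(fn func(*grocksdb.ReadOptions) error) error {
	ro := grocksdb.NewDefaultReadOptions()
	defer ro.Destroy()
	return fn(ro)
}

// getNode shows the usage pattern from checkRoot above and GetNode just
// below: results escape the closure by assigning to an outer variable, and
// C-backed data is copied out before the deferred Free runs.
func getNode(db *grocksdb.DB, cf *grocksdb.ColumnFamilyHandle, key []byte) ([]byte, error) {
	var out []byte
	err := withRead(func(ro *grocksdb.ReadOptions) error {
		s, err := db.GetCF(ro, cf, key)
		if err != nil {
			return err
		}
		defer s.Free()
		if !s.Exists() {
			return fmt.Errorf("sketch: node not found")
		}
		out = append([]byte{}, s.Data()...)
		return nil
	})
	return out, err
}
```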
@@ -345,19 +347,23 @@ func (d *rocksdbNodeDB) GetNode(root node.Root, ptr *node.Pointer) (node.Node, e } cf := d.getColumnFamilyForRoot(root) - s, err := d.db.GetCF(timestampReadOptions(root.Version), cf, nodeKeyFmt.Encode(&ptr.Hash)) - if err != nil { - return nil, fmt.Errorf("mkvs/rocksdb: failed to get node from backing store: %w", err) - } - defer s.Free() - if !s.Exists() { - return nil, api.ErrNodeNotFound - } - var n node.Node - n, err = node.UnmarshalBinary(s.Data()) - if err != nil { - return nil, fmt.Errorf("mkvs/rocksdb: failed to unmarshal node: %w", err) + if err := withTimestampRead(root.Version, func(readOpts *grocksdb.ReadOptions) error { + s, err := d.db.GetCF(readOpts, cf, nodeKeyFmt.Encode(&ptr.Hash)) + if err != nil { + return fmt.Errorf("mkvs/rocksdb: failed to get node from backing store: %w", err) + } + defer s.Free() + if !s.Exists() { + return api.ErrNodeNotFound + } + n, err = node.UnmarshalBinary(s.Data()) + if err != nil { + return fmt.Errorf("mkvs/rocksdb: failed to unmarshal node: %w", err) + } + return nil + }); err != nil { + return nil, err } return n, nil @@ -418,7 +424,9 @@ func (d *rocksdbNodeDB) GetWriteLog(ctx context.Context, startRoot, endRoot node wl, err := func() (writelog.Iterator, error) { // Iterate over all write logs that result in the current item. prefix := writeLogKeyFmt.Encode(endRoot.Version, &curItem.endRootHash) - it := prefixIterator(d.db.NewIteratorCF(timestampReadOptions(endRoot.Version), cf), prefix) + ro := timestampReadOptions(endRoot.Version) + defer ro.Destroy() + it := prefixIterator(d.db.NewIteratorCF(ro, cf), prefix) defer it.Close() for ; it.Valid(); it.Next() { @@ -462,7 +470,9 @@ func (d *rocksdbNodeDB) GetWriteLog(ctx context.Context, startRoot, endRoot node Hash: nextItem.logRoots[index].Hash(), } - item, err := d.db.GetCF(timestampReadOptions(endRoot.Version), cf, key) + ro := timestampReadOptions(endRoot.Version) + defer ro.Destroy() + item, err := d.db.GetCF(ro, cf, key) if err != nil { return node.Root{}, nil, err } @@ -678,7 +688,9 @@ func (d *rocksdbNodeDB) Finalize(roots []node.Root) error { // nolint: gocyclo if err = func() error { cf := d.getColumnFamilyForType(rootHash.Type()) rootWriteLogsPrefix := writeLogKeyFmt.Encode(version, &rootHash) - wit := prefixIterator(d.db.NewIteratorCF(timestampReadOptions(version), cf), rootWriteLogsPrefix) + ro := timestampReadOptions(version) + defer ro.Destroy() + wit := prefixIterator(d.db.NewIteratorCF(ro, cf), rootWriteLogsPrefix) defer wit.Close() for ; wit.Valid(); wit.Next() { @@ -771,7 +783,9 @@ func (d *rocksdbNodeDB) Prune(ctx context.Context, version uint64) error { h := n.GetHash() cf := d.getColumnFamilyForRoot(root) - s, ts, err := d.db.GetCFWithTS(timestampReadOptions(root.Version), cf, nodeKeyFmt.Encode(&h)) + itRo := timestampReadOptions(root.Version) + defer itRo.Destroy() + s, ts, err := d.db.GetCFWithTS(itRo, cf, nodeKeyFmt.Encode(&h)) if err != nil { return false } @@ -803,7 +817,9 @@ func (d *rocksdbNodeDB) Prune(ctx context.Context, version uint64) error { // Prune all write logs in version. 
if !d.discardWriteLogs { discardLogs := func(cf *grocksdb.ColumnFamilyHandle) { - wit := prefixIterator(d.db.NewIteratorCF(timestampReadOptions(version), cf), writeLogKeyFmt.Encode(version)) + ro := timestampReadOptions(version) + defer ro.Destroy() + wit := prefixIterator(d.db.NewIteratorCF(ro, cf), writeLogKeyFmt.Encode(version)) defer wit.Close() for ; wit.Valid(); wit.Next() { diff --git a/go/storage/mkvs/db/rocksdb/rocksdb_test.go b/go/storage/mkvs/db/rocksdb/rocksdb_test.go index 33785d07011..567d2f2cdd9 100644 --- a/go/storage/mkvs/db/rocksdb/rocksdb_test.go +++ b/go/storage/mkvs/db/rocksdb/rocksdb_test.go @@ -113,13 +113,15 @@ func createCheckpoint(ctx context.Context, require *require.Assertions, dir stri nodeKeys := keySet{} loadNodes := func(cf *grocksdb.ColumnFamilyHandle) { - it := prefixIterator(rocksdb.db.NewIteratorCF(timestampReadOptions(2), cf), nil) - defer it.Close() - for ; it.Valid(); it.Next() { - if bytes.HasPrefix(it.Key(), nodePrefix) { - nodeKeys[string(it.Key())] = struct{}{} + withTimestampRead(2, func(readOpts *grocksdb.ReadOptions) error { + it := prefixIterator(rocksdb.db.NewIteratorCF(readOpts, cf), nil) + defer it.Close() + for ; it.Valid(); it.Next() { + if bytes.HasPrefix(it.Key(), nodePrefix) { + nodeKeys[string(it.Key())] = struct{}{} + } } - } + }) } loadNodes(rocksdb.cfIOTree) loadNodes(rocksdb.cfStateTree) @@ -134,17 +136,19 @@ func verifyNodes(require *require.Assertions, rocksdb *rocksdbNodeDB, version ui } checkNodes := func(cf *grocksdb.ColumnFamilyHandle) { - it := prefixIterator(rocksdb.db.NewIteratorCF(timestampReadOptions(version), cf), nil) - defer it.Close() - for ; it.Valid(); it.Next() { - key := it.Key() - if !bytes.HasPrefix(key, nodePrefix) { - continue + withTimestampRead(version, func(readOpts *grocksdb.ReadOptions) error { + it := prefixIterator(rocksdb.db.NewIteratorCF(readOpts, cf), nil) + defer it.Close() + for ; it.Valid(); it.Next() { + key := it.Key() + if !bytes.HasPrefix(key, nodePrefix) { + continue + } + _, ok := keySet[string(key)] + require.Equal(true, ok, "unexpected node in db") + delete(notVisited, string(key)) } - _, ok := keySet[string(key)] - require.Equal(true, ok, "unexpected node in db") - delete(notVisited, string(key)) - } + }) } checkNodes(rocksdb.cfIOTree) checkNodes(rocksdb.cfStateTree) diff --git a/go/storage/mkvs/db/rocksdb/timestamp.go b/go/storage/mkvs/db/rocksdb/timestamp.go index 73c1e72c4a3..56a7b7e5a51 100644 --- a/go/storage/mkvs/db/rocksdb/timestamp.go +++ b/go/storage/mkvs/db/rocksdb/timestamp.go @@ -75,6 +75,14 @@ func timestampReadOptions(version uint64) *grocksdb.ReadOptions { return readOpts } +// withTimestampRead executes the given function with the timestamp read options and destroys them. 
+func withTimestampRead(version uint64, fn func(*grocksdb.ReadOptions) error) error { + readOpts := timestampReadOptions(version) + defer readOpts.Destroy() + + return fn(readOpts) +} + func timestampFromVersion(version uint64) [timestampSize]byte { var ts [timestampSize]byte binary.LittleEndian.PutUint64(ts[:], version) From a02efbf702d9914ffbd2633c55174480df7a8c2b Mon Sep 17 00:00:00 2001 From: ptrus Date: Wed, 8 Nov 2023 09:23:30 +0100 Subject: [PATCH 16/28] rocksdb: use fixed fork --- go/go.mod | 3 +++ go/go.sum | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/go/go.mod b/go/go.mod index 6c99a61c15f..42af81780f6 100644 --- a/go/go.mod +++ b/go/go.mod @@ -1,7 +1,10 @@ module github.com/oasisprotocol/oasis-core/go replace ( + github.com/cometbft/cometbft => github.com/oasisprotocol/cometbft v0.37.2-oasis1 + // Temp. + github.com/linxGnu/grocksdb => github.com/ptrus/grocksdb v0.0.0-20231108081122-7a5267edc303 // v1.5.0 has broken uint parsing, use my commit with fixes instead until // the maintainers merge my PR: https://github.com/spf13/cast/pull/144 diff --git a/go/go.sum b/go/go.sum index 49a349d3964..109b275c849 100644 --- a/go/go.sum +++ b/go/go.sum @@ -417,8 +417,6 @@ github.com/libp2p/go-reuseport v0.4.0 h1:nR5KU7hD0WxXCJbmw7r2rhRYruNRl2koHw8fQsc github.com/libp2p/go-reuseport v0.4.0/go.mod h1:ZtI03j/wO5hZVDFo2jKywN6bYKWLOy8Se6DrI2E1cLU= github.com/libp2p/go-yamux/v4 v4.0.1 h1:FfDR4S1wj6Bw2Pqbc8Uz7pCxeRBPbwsBbEdfwiCypkQ= github.com/libp2p/go-yamux/v4 v4.0.1/go.mod h1:NWjl8ZTLOGlozrXSOZ/HlfG++39iKNnM5wwmtQP1YB4= -github.com/linxGnu/grocksdb v1.8.4 h1:ZMsBpPpJNtRLHiKKp0mI7gW+NT4s7UgfD5xHxx1jVRo= -github.com/linxGnu/grocksdb v1.8.4/go.mod h1:xZCIb5Muw+nhbDK4Y5UJuOrin5MceOuiXkVUR7vp4WY= github.com/lunixbochs/vtclean v1.0.0/go.mod h1:pHhQNgMf3btfWnGBVipUOjRYhoOsdGqdm/+2c2E2WMI= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= @@ -571,6 +569,8 @@ github.com/prometheus/common v0.44.0/go.mod h1:ofAIvZbQ1e/nugmZGz4/qCb9Ap1VoSTIO github.com/prometheus/procfs v0.0.0-20180725123919-05ee40e3a273/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.11.1 h1:xRC8Iq1yyca5ypa9n1EZnWZkt7dwcoRPQwX/5gwaUuI= github.com/prometheus/procfs v0.11.1/go.mod h1:eesXgaPo1q7lBpVMoMy0ZOFTth9hBn4W/y0/p/ScXhY= +github.com/ptrus/grocksdb v0.0.0-20231108081122-7a5267edc303 h1:nFxsXsak9DFLHjhEAZtJYCYcHI5Zp49LtaxNgVjPpIg= +github.com/ptrus/grocksdb v0.0.0-20231108081122-7a5267edc303/go.mod h1:xZCIb5Muw+nhbDK4Y5UJuOrin5MceOuiXkVUR7vp4WY= github.com/quic-go/qpack v0.4.0 h1:Cr9BXA1sQS2SmDUWjSofMPNKmvF6IiIfDRmgU0w1ZCo= github.com/quic-go/qpack v0.4.0/go.mod h1:UZVnYIfi5GRk+zI9UMaCPsmZ2xKJP7XBUvVyT1Knj9A= github.com/quic-go/qtls-go1-20 v0.3.2 h1:rRgN3WfnKbyik4dBV8A6girlJVxGand/d+jVKbQq5GI= From eb61f19118e223f874c8290d5f81f4be8cd9e790 Mon Sep 17 00:00:00 2001 From: ptrus Date: Wed, 8 Nov 2023 12:09:22 +0100 Subject: [PATCH 17/28] rocksdb: reuse flush options --- go/storage/mkvs/db/rocksdb/rocksdb.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/go/storage/mkvs/db/rocksdb/rocksdb.go b/go/storage/mkvs/db/rocksdb/rocksdb.go index a1d40838cf7..a30bec53cd8 100644 --- a/go/storage/mkvs/db/rocksdb/rocksdb.go +++ b/go/storage/mkvs/db/rocksdb/rocksdb.go @@ -87,6 +87,7 @@ var ( var ( defaultWriteOptions = grocksdb.NewDefaultWriteOptions() defaultReadOptions = grocksdb.NewDefaultReadOptions() + defaultFlushOptions = 
grocksdb.NewDefaultFlushOptions()
 )
 
 const (
@@ -989,8 +990,7 @@ func (d *rocksdbNodeDB) Size() (uint64, error) {
 }
 
 func (d *rocksdbNodeDB) Sync() error {
-	opts := grocksdb.NewDefaultFlushOptions()
-	return d.db.FlushCFs([]*grocksdb.ColumnFamilyHandle{d.cfMetadata, d.cfIOTree, d.cfStateTree}, opts)
+	return d.db.FlushCFs([]*grocksdb.ColumnFamilyHandle{d.cfMetadata, d.cfIOTree, d.cfStateTree}, defaultFlushOptions)
 }
 
 func (d *rocksdbNodeDB) Close() {

From 1beb7b864f52cd2ded8c74b61682839a4e88de35 Mon Sep 17 00:00:00 2001
From: ptrus
Date: Wed, 8 Nov 2023 14:24:48 +0100
Subject: [PATCH 18/28] rocksdb: update used options

---
 go/storage/mkvs/db/rocksdb/rocksdb.go | 59 ++++++++++++++++-----------
 1 file changed, 35 insertions(+), 24 deletions(-)

diff --git a/go/storage/mkvs/db/rocksdb/rocksdb.go b/go/storage/mkvs/db/rocksdb/rocksdb.go
index a30bec53cd8..1835c4ba322 100644
--- a/go/storage/mkvs/db/rocksdb/rocksdb.go
+++ b/go/storage/mkvs/db/rocksdb/rocksdb.go
@@ -105,13 +105,14 @@ func New(cfg *api.Config) (api.NodeDB, error) {
 		readOnly: cfg.ReadOnly,
 	}
 
-	// XXX: Most of these were taken from Cosmos-SDK RocksDB impl.
-	// Experiment/modify if needed. Most of these can be adjusted
-	// on a live database.
-	// Also see: https://github.com/facebook/rocksdb/wiki/RocksDB-Tuning-Guide
+	// XXX: The options below were taken from a combination of:
+	// - Cosmos-SDK RocksDB implementation
+	// - https://github.com/facebook/rocksdb/wiki/RocksDB-Tuning-Guide
+	// - https://github.com/facebook/rocksdb/wiki/Setup-Options-and-Basic-Tuning
+	// Experiment/modify if needed.
 
 	// Create options for the metadata column family.
-	// TODO: Consider some tuning for meta options.
+	// TODO: Consider also tuning some options of the metadata CF (although this is small compared to nodes CFs).
 	optsMeta := grocksdb.NewDefaultOptions()
 	optsMeta.SetCreateIfMissing(true)
 	optsMeta.SetCreateIfMissingColumnFamilies(true)
@@ -120,40 +121,50 @@ func New(cfg *api.Config) (api.NodeDB, error) {
 	// TODO: Consider separate options for state vs. io.
 	optsNodes := grocksdb.NewDefaultOptions()
 	optsNodes.SetCreateIfMissing(true)
-	optsNodes.SetComparator(createTimestampComparator())
 	optsNodes.IncreaseParallelism(runtime.NumCPU())
+
+	// General options.
+	// https://github.com/facebook/rocksdb/wiki/Setup-Options-and-Basic-Tuning#other-general-options
+	optsNodes.SetLevelCompactionDynamicLevelBytes(true)
+	optsNodes.SetBytesPerSync(1048576) // 1 MB.
 	optsNodes.OptimizeLevelStyleCompaction(512 * 1024 * 1024)
 	optsNodes.SetTargetFileSizeMultiplier(2)
-	optsNodes.SetLevelCompactionDynamicLevelBytes(true)
 
 	bbto := grocksdb.NewDefaultBlockBasedTableOptions()
 	bbto.SetBlockSize(32 * 1024)
+	bbto.SetPinL0FilterAndIndexBlocksInCache(true)
+	// Configure block cache. Recommendation is 1/3 of memory budget.
+	// https://github.com/facebook/rocksdb/wiki/Setup-Options-and-Basic-Tuning#block-cache-size
 	if cfg.MaxCacheSize == 0 {
-		// Default to 64mb block cache size if not configured.
-		bbto.SetBlockCache(grocksdb.NewLRUCache(64 * 1024 * 1024))
+		// Default to 128mb block cache size if not configured.
+		bbto.SetBlockCache(grocksdb.NewLRUCache(128 * 1024 * 1024))
 	} else {
 		bbto.SetBlockCache(grocksdb.NewLRUCache(uint64(cfg.MaxCacheSize)))
 	}
+
+	// Configure query filter.
+ // https://github.com/facebook/rocksdb/wiki/Setup-Options-and-Basic-Tuning#bloom-filters + // http://rocksdb.org/blog/2021/12/29/ribbon-filter.html bbto.SetFilterPolicy(grocksdb.NewRibbonHybridFilterPolicy(9.9, 1)) + bbto.SetOptimizeFiltersForMemory(true) + // https://github.com/facebook/rocksdb/wiki/Index-Block-Format#index_type--kbinarysearchwithfirstkey bbto.SetIndexType(grocksdb.KBinarySearchWithFirstKey) - optsNodes.SetBlockBasedTableFactory(bbto) - optsNodes.SetCompressionOptionsParallelThreads(4) - /* - // Apparently with dict compression the file writer doesn't report file size: - // https://github.com/facebook/rocksdb/issues/11146 - // compression options at bottommost level - opts.SetBottommostCompression(grocksdb.ZSTDCompression) - - compressOpts := grocksdb.NewDefaultCompressionOptions() - compressOpts.MaxDictBytes = 112640 // 110k - compressOpts.Level = 12 - - opts.SetBottommostCompressionOptions(compressOpts, true) - opts.SetBottommostCompressionOptionsZstdMaxTrainBytes(compressOpts.MaxDictBytes*100, true) + optsNodes.SetBlockBasedTableFactory(bbto) - */ + // Configure compression. + // https://github.com/facebook/rocksdb/wiki/Setup-Options-and-Basic-Tuning#compression + optsNodes.SetCompression(grocksdb.LZ4Compression) + optsNodes.SetBottommostCompression(grocksdb.ZSTDCompression) + + // Configure ZSTD (follows Cosmos-SDK values). + compressOpts := grocksdb.NewDefaultCompressionOptions() + compressOpts.MaxDictBytes = 110 * 1024 // 110KB - typical size for ZSTD. + compressOpts.Level = 12 // Higher compression. + optsNodes.SetBottommostCompressionOptions(compressOpts, true) + optsNodes.SetBottommostCompressionOptionsZstdMaxTrainBytes(compressOpts.MaxDictBytes*100, true) // 100 * dict size. + optsNodes.SetCompressionOptionsParallelThreads(4) var err error var cfHandles []*grocksdb.ColumnFamilyHandle From adc0cfad1eeb53b749a38276fc3e6dda06587e82 Mon Sep 17 00:00:00 2001 From: ptrus Date: Wed, 8 Nov 2023 15:43:20 +0100 Subject: [PATCH 19/28] rocksdb: minor fixes --- go/storage/mkvs/db/rocksdb/rocksdb.go | 32 +++++++++++++++------------ 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/go/storage/mkvs/db/rocksdb/rocksdb.go b/go/storage/mkvs/db/rocksdb/rocksdb.go index 1835c4ba322..abd70d5fabe 100644 --- a/go/storage/mkvs/db/rocksdb/rocksdb.go +++ b/go/storage/mkvs/db/rocksdb/rocksdb.go @@ -34,6 +34,8 @@ var ( // rootsMetadataKeyFmt is the key format for roots metadata. The key format is (version). // // Value is CBOR-serialized rootsMetadata. + // TODO: The rootsMetadata is one per version, which means it can also get quite large, + // maybe use same db options as for nodes CFs? (minus the timestamps). rootsMetadataKeyFmt = keyformat.New(0x00, uint64(0)) // rootUpdatedNodesKeyFmt is the key format for the pending updated nodes for the @@ -271,12 +273,12 @@ func (d *rocksdbNodeDB) load() error { // Load metadata. item, err := d.db.Get(defaultReadOptions, metadataKeyFmt.Encode()) - switch err { - case nil: - if !item.Exists() { - break - } - defer item.Free() + if err != nil { + return err + } + defer item.Free() + switch { + case item.Exists(): // Metadata already exists, just load it and verify that it is // compatible with what we have here. @@ -296,16 +298,14 @@ func (d *rocksdbNodeDB) load() error { d.meta.value.Namespace, ) } - return nil default: - return err - } + // No metadata exists, create some. 
+ d.meta.value.Version = dbVersion + d.meta.value.Namespace = d.namespace + if err = d.meta.save(d.db); err != nil { + return err + } - // No metadata exists, create some. - d.meta.value.Version = dbVersion - d.meta.value.Namespace = d.namespace - if err = d.meta.save(d.db); err != nil { - return err } return nil @@ -803,6 +803,7 @@ func (d *rocksdbNodeDB) Prune(ctx context.Context, version uint64) error { } defer s.Free() if !s.Exists() { + ts.Free() return false } @@ -826,6 +827,9 @@ func (d *rocksdbNodeDB) Prune(ctx context.Context, version uint64) error { batch.Delete(rootNodeKeyFmt.Encode(&rootHash)) } + // Prune roots metadata. + batch.Delete(rootsMetadataKeyFmt.Encode(version)) + // Prune all write logs in version. if !d.discardWriteLogs { discardLogs := func(cf *grocksdb.ColumnFamilyHandle) { From 40d90ad2600e2f98ab0ea097387a470583a56041 Mon Sep 17 00:00:00 2001 From: ptrus Date: Thu, 9 Nov 2023 13:44:14 +0100 Subject: [PATCH 20/28] rocksdb: bump version --- go/go.mod | 5 +---- go/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/go/go.mod b/go/go.mod index 42af81780f6..2bea3fb20c5 100644 --- a/go/go.mod +++ b/go/go.mod @@ -1,10 +1,7 @@ module github.com/oasisprotocol/oasis-core/go replace ( - github.com/cometbft/cometbft => github.com/oasisprotocol/cometbft v0.37.2-oasis1 - // Temp. - github.com/linxGnu/grocksdb => github.com/ptrus/grocksdb v0.0.0-20231108081122-7a5267edc303 // v1.5.0 has broken uint parsing, use my commit with fixes instead until // the maintainers merge my PR: https://github.com/spf13/cast/pull/144 @@ -36,7 +33,7 @@ require ( github.com/ipfs/go-log/v2 v2.5.1 github.com/libp2p/go-libp2p v0.30.0 github.com/libp2p/go-libp2p-pubsub v0.9.3 - github.com/linxGnu/grocksdb v1.8.4 + github.com/linxGnu/grocksdb v1.8.5 github.com/multiformats/go-multiaddr v0.11.0 github.com/oasisprotocol/curve25519-voi v0.0.0-20230110094441-db37f07504ce github.com/oasisprotocol/deoxysii v0.0.0-20220228165953-2091330c22b7 diff --git a/go/go.sum b/go/go.sum index 109b275c849..10fd05deb2a 100644 --- a/go/go.sum +++ b/go/go.sum @@ -417,6 +417,8 @@ github.com/libp2p/go-reuseport v0.4.0 h1:nR5KU7hD0WxXCJbmw7r2rhRYruNRl2koHw8fQsc github.com/libp2p/go-reuseport v0.4.0/go.mod h1:ZtI03j/wO5hZVDFo2jKywN6bYKWLOy8Se6DrI2E1cLU= github.com/libp2p/go-yamux/v4 v4.0.1 h1:FfDR4S1wj6Bw2Pqbc8Uz7pCxeRBPbwsBbEdfwiCypkQ= github.com/libp2p/go-yamux/v4 v4.0.1/go.mod h1:NWjl8ZTLOGlozrXSOZ/HlfG++39iKNnM5wwmtQP1YB4= +github.com/linxGnu/grocksdb v1.8.5 h1:Okfk5B1h0ikCYdDM7Tc5yJUS8LTwAmMBq5IPWTmOLPs= +github.com/linxGnu/grocksdb v1.8.5/go.mod h1:xZCIb5Muw+nhbDK4Y5UJuOrin5MceOuiXkVUR7vp4WY= github.com/lunixbochs/vtclean v1.0.0/go.mod h1:pHhQNgMf3btfWnGBVipUOjRYhoOsdGqdm/+2c2E2WMI= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= @@ -569,8 +571,6 @@ github.com/prometheus/common v0.44.0/go.mod h1:ofAIvZbQ1e/nugmZGz4/qCb9Ap1VoSTIO github.com/prometheus/procfs v0.0.0-20180725123919-05ee40e3a273/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.11.1 h1:xRC8Iq1yyca5ypa9n1EZnWZkt7dwcoRPQwX/5gwaUuI= github.com/prometheus/procfs v0.11.1/go.mod h1:eesXgaPo1q7lBpVMoMy0ZOFTth9hBn4W/y0/p/ScXhY= -github.com/ptrus/grocksdb v0.0.0-20231108081122-7a5267edc303 h1:nFxsXsak9DFLHjhEAZtJYCYcHI5Zp49LtaxNgVjPpIg= -github.com/ptrus/grocksdb v0.0.0-20231108081122-7a5267edc303/go.mod h1:xZCIb5Muw+nhbDK4Y5UJuOrin5MceOuiXkVUR7vp4WY= 
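The Prune fix a few hunks above (the added `ts.Free()` on the early-return path) exists because `GetCFWithTS` hands back two C-backed slices, the value and its timestamp, and both must be freed on every path. A sketch of the full discipline, decoding the timestamp inline with the same little-endian layout as `timestampFromVersion`; the function name is illustrative:

```go
package sketch

import (
	"encoding/binary"

	"github.com/linxGnu/grocksdb"
)

// nodeVersion returns the version a key was last written at, and whether the
// key exists. Both slices are freed exactly once via defer, which avoids the
// kind of leak fixed above.
func nodeVersion(db *grocksdb.DB, cf *grocksdb.ColumnFamilyHandle, ro *grocksdb.ReadOptions, key []byte) (uint64, bool, error) {
	s, ts, err := db.GetCFWithTS(ro, cf, key)
	if err != nil {
		return 0, false, err
	}
	defer s.Free()
	defer ts.Free() // The slice that was leaking when s did not exist.

	if !s.Exists() {
		return 0, false, nil
	}
	return binary.LittleEndian.Uint64(ts.Data()), true, nil
}
```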
github.com/quic-go/qpack v0.4.0 h1:Cr9BXA1sQS2SmDUWjSofMPNKmvF6IiIfDRmgU0w1ZCo= github.com/quic-go/qpack v0.4.0/go.mod h1:UZVnYIfi5GRk+zI9UMaCPsmZ2xKJP7XBUvVyT1Knj9A= github.com/quic-go/qtls-go1-20 v0.3.2 h1:rRgN3WfnKbyik4dBV8A6girlJVxGand/d+jVKbQq5GI= From 556022e98d4d8a2e71f3cdf09b87fdcad54138da Mon Sep 17 00:00:00 2001 From: ptrus Date: Thu, 9 Nov 2023 14:54:39 +0100 Subject: [PATCH 21/28] rocksdb: wip stats code --- go/storage/mkvs/db/rocksdb/rocksdb.go | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/go/storage/mkvs/db/rocksdb/rocksdb.go b/go/storage/mkvs/db/rocksdb/rocksdb.go index abd70d5fabe..2d526b62621 100644 --- a/go/storage/mkvs/db/rocksdb/rocksdb.go +++ b/go/storage/mkvs/db/rocksdb/rocksdb.go @@ -168,6 +168,13 @@ func New(cfg *api.Config) (api.NodeDB, error) { optsNodes.SetBottommostCompressionOptionsZstdMaxTrainBytes(compressOpts.MaxDictBytes*100, true) // 100 * dict size. optsNodes.SetCompressionOptionsParallelThreads(4) + /* + // TODO: only enable statistics via a config param. + // 5-10% performance penalty with statistics based on documentation. + optsMeta.EnableStatistics() + optsNodes.EnableStatistics() + */ + var err error var cfHandles []*grocksdb.ColumnFamilyHandle switch cfg.ReadOnly { @@ -1017,3 +1024,14 @@ func (d *rocksdbNodeDB) Close() { d.db = nil }) } + +/* +func (d *rocksdbNodeDB) getStats() { + opts, err := grocksdb.LoadLatestOptions("path", nil, true, nil) + if err != nil { + panic(err) + } + defer opts.Destroy() + str := opts.Options().GetStatisticsString() +} +*/ From 21f5eff2c1859fa1b70b0c58791166febbfc8e07 Mon Sep 17 00:00:00 2001 From: ptrus Date: Thu, 9 Nov 2023 15:51:22 +0100 Subject: [PATCH 22/28] rocksdb: separate CF for roots --- go/storage/mkvs/db/rocksdb/batch.go | 4 +- go/storage/mkvs/db/rocksdb/metadata.go | 10 +++-- go/storage/mkvs/db/rocksdb/rocksdb.go | 53 ++++++++++++++++---------- 3 files changed, 40 insertions(+), 27 deletions(-) diff --git a/go/storage/mkvs/db/rocksdb/batch.go b/go/storage/mkvs/db/rocksdb/batch.go index dbe4c298319..b80666c75b4 100644 --- a/go/storage/mkvs/db/rocksdb/batch.go +++ b/go/storage/mkvs/db/rocksdb/batch.go @@ -61,7 +61,7 @@ func (ba *rocksdbBatch) Commit(root node.Root) error { return api.ErrAlreadyFinalized } - rootsMeta, err := loadRootsMetadata(ba.db.db, root.Version) + rootsMeta, err := loadRootsMetadata(ba.db.db, ba.db.cfRoots, root.Version) if err != nil { return err } @@ -102,7 +102,7 @@ func (ba *rocksdbBatch) Commit(root node.Root) error { } var oldRootsMeta *rootsMetadata - oldRootsMeta, err = loadRootsMetadata(ba.db.db, ba.oldRoot.Version) + oldRootsMeta, err = loadRootsMetadata(ba.db.db, ba.db.cfRoots, ba.oldRoot.Version) if err != nil { return err } diff --git a/go/storage/mkvs/db/rocksdb/metadata.go b/go/storage/mkvs/db/rocksdb/metadata.go index 2e11c6ffda8..c5255005a8d 100644 --- a/go/storage/mkvs/db/rocksdb/metadata.go +++ b/go/storage/mkvs/db/rocksdb/metadata.go @@ -126,15 +126,17 @@ type rootsMetadata struct { // Roots is the map of a root created in a version to any derived roots (in this or later versions). Roots map[node.TypedHash][]node.TypedHash + rootsCf *grocksdb.ColumnFamilyHandle + // version is the version this metadata is for. version uint64 } // loadRootsMetadata loads the roots metadata for the given version from the database. 
-func loadRootsMetadata(db *grocksdb.DB, version uint64) (*rootsMetadata, error) {
-	rootsMeta := &rootsMetadata{version: version}
+func loadRootsMetadata(db *grocksdb.DB, cf *grocksdb.ColumnFamilyHandle, version uint64) (*rootsMetadata, error) {
+	rootsMeta := &rootsMetadata{version: version, rootsCf: cf}

-	s, err := db.Get(defaultReadOptions, rootsMetadataKeyFmt.Encode(version))
+	s, err := db.GetCF(defaultReadOptions, cf, rootsMetadataKeyFmt.Encode(version))
 	if err != nil {
 		return nil, fmt.Errorf("mkvs/rocksdb: failed to get roots metadata from backing store: %w", err)
 	}
@@ -152,5 +154,5 @@ func loadRootsMetadata(db *grocksdb.DB, version uint64) (*rootsMetadata, error)

 // save saves the roots metadata to the database.
 func (rm *rootsMetadata) save(batch *grocksdb.WriteBatch) {
-	batch.Put(rootsMetadataKeyFmt.Encode(rm.version), cbor.Marshal(rm))
+	batch.PutCF(rm.rootsCf, rootsMetadataKeyFmt.Encode(rm.version), cbor.Marshal(rm))
 }
diff --git a/go/storage/mkvs/db/rocksdb/rocksdb.go b/go/storage/mkvs/db/rocksdb/rocksdb.go
index 2d526b62621..7396d657b31 100644
--- a/go/storage/mkvs/db/rocksdb/rocksdb.go
+++ b/go/storage/mkvs/db/rocksdb/rocksdb.go
@@ -29,26 +29,19 @@ const (
 	multipartVersionNone uint64 = 0
 )

-// Metadata CF keys (not timestamped).
+// Metadata (default) CF keys (not timestamped).
 var (
-	// rootsMetadataKeyFmt is the key format for roots metadata. The key format is (version).
-	//
-	// Value is CBOR-serialized rootsMetadata.
-	// TODO: The rootsMetadata is one per version, which means it can also get quite large,
-	// maybe use same db options as for nodes CFs? (minus the timestamps).
-	rootsMetadataKeyFmt = keyformat.New(0x00, uint64(0))
-
 	// rootUpdatedNodesKeyFmt is the key format for the pending updated nodes for the
 	// given root that need to be removed only in case the given root is not among
 	// the finalized roots. The key format is (version, root).
 	//
 	// Value is CBOR-serialized []updatedNode.
-	rootUpdatedNodesKeyFmt = keyformat.New(0x01, uint64(0), &node.TypedHash{})
+	rootUpdatedNodesKeyFmt = keyformat.New(0x00, uint64(0), &node.TypedHash{})

 	// metadataKeyFmt is the key format for metadata.
 	//
 	// Value is CBOR-serialized metadata.
-	metadataKeyFmt = keyformat.New(0x02)
+	metadataKeyFmt = keyformat.New(0x01)

 	// multipartRestoreNodeLogKeyFmt is the key format for the nodes inserted during a chunk restore.
 	// Once a set of chunks is fully restored, these entries should be removed. If chunk restoration
 	// is interrupted for any reason, the nodes associated with these keys should be removed, along
 	// with these entries.
 	//
 	// Value is empty.
-	multipartRestoreNodeLogKeyFmt = keyformat.New(0x03, &node.TypedHash{})
+	multipartRestoreNodeLogKeyFmt = keyformat.New(0x02, &node.TypedHash{})

 	// multipartRestoreRootLogKeyFmt is the key format for the root nodes inserted during a chunk restore.
 	// Once a set of chunks is fully restored, these entries should be removed. If chunk restoration
 	// is interrupted for any reason, the nodes associated with these keys should be removed, along
 	// with these entries.
 	//
 	// Value is empty.
-	multipartRestoreRootLogKeyFmt = keyformat.New(0x04, &node.TypedHash{})
+	multipartRestoreRootLogKeyFmt = keyformat.New(0x03, &node.TypedHash{})
+)
+
+// Roots CF keys (not timestamped).
+var (
+	// rootsMetadataKeyFmt is the key format for roots metadata. The key format is (version).
+	//
+	// Value is CBOR-serialized rootsMetadata.
+	rootsMetadataKeyFmt = keyformat.New(0x00, uint64(0))
 )

 // Node CF keys (timestamped and used by state and io tree CFs).
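
Note on the renumbering above: each column family is its own keyspace, so the one-byte key prefixes only need to be unique within a CF, which is why rootsMetadataKeyFmt can reuse 0x00 in the new roots CF. A minimal illustrative round-trip through go/common/keyformat (not part of the patch; assumes Encode/Decode behave as they are used elsewhere in this series, with uint64 elements stored fixed-size big-endian):

	// Sketch only: encode a version into a roots-metadata key and decode it back.
	key := rootsMetadataKeyFmt.Encode(uint64(42)) // 0x00 prefix followed by the 8-byte version.
	var version uint64
	if rootsMetadataKeyFmt.Decode(key, &version) {
		// version == 42
	}
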
@@ -94,6 +95,7 @@ var ( const ( cfMetadataName = "default" + cfRootsName = "roots" cfStateTreeName = "state_tree" cfIOTreeName = "io_tree" ) @@ -134,6 +136,7 @@ func New(cfg *api.Config) (api.NodeDB, error) { optsNodes.SetTargetFileSizeMultiplier(2) bbto := grocksdb.NewDefaultBlockBasedTableOptions() + bbto.SetFormatVersion(4) // Latest version format, default uses older backwards compatible one. bbto.SetBlockSize(32 * 1024) bbto.SetPinL0FilterAndIndexBlocksInCache(true) // Configure block cache. Recommendation is 1/3 of memory budget. @@ -184,6 +187,7 @@ func New(cfg *api.Config) (api.NodeDB, error) { cfg.DB, []string{ cfMetadataName, + cfRootsName, cfStateTreeName, cfIOTreeName, }, @@ -191,6 +195,7 @@ func New(cfg *api.Config) (api.NodeDB, error) { optsMeta, optsNodes, optsNodes, + optsNodes, }, false) case false: @@ -199,6 +204,7 @@ func New(cfg *api.Config) (api.NodeDB, error) { cfg.DB, []string{ cfMetadataName, + cfRootsName, cfStateTreeName, cfIOTreeName, }, @@ -206,6 +212,7 @@ func New(cfg *api.Config) (api.NodeDB, error) { optsMeta, optsNodes, optsNodes, + optsNodes, }, ) } @@ -213,8 +220,9 @@ func New(cfg *api.Config) (api.NodeDB, error) { return nil, fmt.Errorf("mkvs/rocksdb: failed to open database: %w", err) } db.cfMetadata = cfHandles[0] // Also the default handle. - db.cfStateTree = cfHandles[1] - db.cfIOTree = cfHandles[2] + db.cfRoots = cfHandles[1] + db.cfStateTree = cfHandles[2] + db.cfIOTree = cfHandles[3] // Load database metadata. if err = db.load(); err != nil { @@ -248,6 +256,7 @@ type rocksdbNodeDB struct { db *grocksdb.DB cfMetadata *grocksdb.ColumnFamilyHandle + cfRoots *grocksdb.ColumnFamilyHandle cfStateTree *grocksdb.ColumnFamilyHandle cfIOTree *grocksdb.ColumnFamilyHandle @@ -549,7 +558,7 @@ func (d *rocksdbNodeDB) GetRootsForVersion(version uint64) ([]node.Root, error) return nil, nil } - rootsMeta, err := loadRootsMetadata(d.db, version) + rootsMeta, err := loadRootsMetadata(d.db, d.cfRoots, version) if err != nil { return nil, err } @@ -581,7 +590,7 @@ func (d *rocksdbNodeDB) HasRoot(root node.Root) bool { return false } - rootsMeta, err := loadRootsMetadata(d.db, root.Version) + rootsMeta, err := loadRootsMetadata(d.db, d.cfRoots, root.Version) if err != nil { panic(err) } @@ -623,7 +632,7 @@ func (d *rocksdbNodeDB) Finalize(roots []node.Root) error { // nolint: gocyclo finalizedRoots[node.TypedHashFromRoot(root)] = true } var rootsChanged bool - rootsMeta, err := loadRootsMetadata(d.db, version) + rootsMeta, err := loadRootsMetadata(d.db, d.cfRoots, version) if err != nil { return err } @@ -777,7 +786,7 @@ func (d *rocksdbNodeDB) Prune(ctx context.Context, version uint64) error { return api.ErrNotEarliest } - rootsMeta, err := loadRootsMetadata(d.db, version) + rootsMeta, err := loadRootsMetadata(d.db, d.cfRoots, version) if err != nil { return err } @@ -835,7 +844,7 @@ func (d *rocksdbNodeDB) Prune(ctx context.Context, version uint64) error { } // Prune roots metadata. - batch.Delete(rootsMetadataKeyFmt.Encode(version)) + batch.DeleteCF(d.cfRoots, rootsMetadataKeyFmt.Encode(version)) // Prune all write logs in version. 
 	if !d.discardWriteLogs {
@@ -1005,20 +1014,22 @@ func (d *rocksdbNodeDB) NewBatch(oldRoot node.Root, version uint64, chunk bool)

 func (d *rocksdbNodeDB) Size() (uint64, error) {
 	meta := d.db.GetColumnFamilyMetadataCF(d.cfMetadata)
+	roots := d.db.GetColumnFamilyMetadataCF(d.cfRoots)
 	io := d.db.GetColumnFamilyMetadataCF(d.cfIOTree)
 	state := d.db.GetColumnFamilyMetadataCF(d.cfStateTree)

-	return meta.Size() + io.Size() + state.Size(), nil
+	return meta.Size() + roots.Size() + io.Size() + state.Size(), nil
 }

 func (d *rocksdbNodeDB) Sync() error {
-	return d.db.FlushCFs([]*grocksdb.ColumnFamilyHandle{d.cfMetadata, d.cfIOTree, d.cfStateTree}, defaultFlushOptions)
+	return d.db.FlushCFs([]*grocksdb.ColumnFamilyHandle{d.cfMetadata, d.cfRoots, d.cfIOTree, d.cfStateTree}, defaultFlushOptions)
 }

 func (d *rocksdbNodeDB) Close() {
 	d.closeOnce.Do(func() {
 		d.db.Close()
 		d.cfMetadata = nil
+		d.cfRoots = nil
 		d.cfIOTree = nil
 		d.cfStateTree = nil
 		d.db = nil

From 7369a08df94fafca783a15152a09bcaf072733fd Mon Sep 17 00:00:00 2001
From: ptrus
Date: Thu, 9 Nov 2023 17:09:40 +0100
Subject: [PATCH 23/28] fix

---
 go/storage/mkvs/db/rocksdb/rocksdb.go | 78 +++++++++++++++------------
 1 file changed, 44 insertions(+), 34 deletions(-)

diff --git a/go/storage/mkvs/db/rocksdb/rocksdb.go b/go/storage/mkvs/db/rocksdb/rocksdb.go
index 7396d657b31..b3a955caf54 100644
--- a/go/storage/mkvs/db/rocksdb/rocksdb.go
+++ b/go/storage/mkvs/db/rocksdb/rocksdb.go
@@ -100,40 +100,27 @@ const (
 	cfIOTreeName = "io_tree"
 )

-// New creates a new RocksDB-backed node database.
-func New(cfg *api.Config) (api.NodeDB, error) {
-	db := &rocksdbNodeDB{
-		logger:           logging.GetLogger("mkvs/db/rocksdb"),
-		namespace:        cfg.Namespace,
-		discardWriteLogs: cfg.DiscardWriteLogs,
-		readOnly:         cfg.ReadOnly,
-	}
-
+func newOptions(versioned bool, maxCacheSize int64) *grocksdb.Options {
 	// XXX: The options below were taken from a combination of:
 	// - Cosmos-SDK RocksDB implementation
 	// - https://github.com/facebook/rocksdb/wiki/RocksDB-Tuning-Guide
 	// - https://github.com/facebook/rocksdb/wiki/Setup-Options-and-Basic-Tuning
 	// Experiment/modify if needed.

-	// Create options for the metadata column family.
-	// TODO: Consider also tuning some options of the metadata CF (although this is small compared to nodes CFs).
-	optsMeta := grocksdb.NewDefaultOptions()
-	optsMeta.SetCreateIfMissing(true)
-	optsMeta.SetCreateIfMissingColumnFamilies(true)
-
-	// Create options for the node column families.
 	// TODO: Consider separate options for state vs. io.
-	optsNodes := grocksdb.NewDefaultOptions()
-	optsNodes.SetCreateIfMissing(true)
-	optsNodes.SetComparator(createTimestampComparator())
-	optsNodes.IncreaseParallelism(runtime.NumCPU())
+	opts := grocksdb.NewDefaultOptions()
+	opts.SetCreateIfMissing(true)
+	if versioned {
+		opts.SetComparator(createTimestampComparator())
+	}
+	opts.IncreaseParallelism(runtime.NumCPU())

 	// General options.
 	// https://github.com/facebook/rocksdb/wiki/Setup-Options-and-Basic-Tuning#other-general-options
-	optsNodes.SetLevelCompactionDynamicLevelBytes(true)
-	optsNodes.SetBytesPerSync(1048576) // 1 MB.
-	optsNodes.OptimizeLevelStyleCompaction(512 * 1024 * 1024)
-	optsNodes.SetTargetFileSizeMultiplier(2)
+	opts.SetLevelCompactionDynamicLevelBytes(true)
+	opts.SetBytesPerSync(1048576) // 1 MB.
+	opts.OptimizeLevelStyleCompaction(512 * 1024 * 1024)
+	opts.SetTargetFileSizeMultiplier(2)

 	bbto := grocksdb.NewDefaultBlockBasedTableOptions()
 	bbto.SetFormatVersion(4) // Latest version format, default uses older backwards compatible one.
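
For context between the hunks above: the comparator installed for versioned CFs is what makes the node CFs behave as a versioned keyspace — reads attach an 8-byte timestamp and observe only entries written at or before that version (the patch wraps this in the timestampReadOptions helper from timestamp.go). An illustrative sketch, not part of the patch; it assumes the grocksdb timestamp API used elsewhere in this series plus an encoding/binary import, and little-endian byte order is an assumption here (the authoritative encoding is whatever timestamp.go's comparator defines, following the Cosmos-SDK code this series borrows from):

	// Sketch only: build read options pinned to a given MKVS version.
	func timestampReadOptionsSketch(version uint64) *grocksdb.ReadOptions {
		var ts [8]byte
		binary.LittleEndian.PutUint64(ts[:], version) // Assumed encoding; see timestamp.go.
		ro := grocksdb.NewDefaultReadOptions()
		ro.SetTimestamp(ts[:]) // Reads see only entries with timestamp <= version.
		return ro
	}
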
@@ -141,11 +128,11 @@ func New(cfg *api.Config) (api.NodeDB, error) { bbto.SetPinL0FilterAndIndexBlocksInCache(true) // Configure block cache. Recommendation is 1/3 of memory budget. // https://github.com/facebook/rocksdb/wiki/Setup-Options-and-Basic-Tuning#block-cache-size - if cfg.MaxCacheSize == 0 { + if maxCacheSize == 0 { // Default to 128mb block cache size if not configured. bbto.SetBlockCache(grocksdb.NewLRUCache(128 * 1024 * 1024)) } else { - bbto.SetBlockCache(grocksdb.NewLRUCache(uint64(cfg.MaxCacheSize))) + bbto.SetBlockCache(grocksdb.NewLRUCache(uint64(maxCacheSize))) } // Configure query filter. @@ -156,20 +143,20 @@ func New(cfg *api.Config) (api.NodeDB, error) { // https://github.com/facebook/rocksdb/wiki/Index-Block-Format#index_type--kbinarysearchwithfirstkey bbto.SetIndexType(grocksdb.KBinarySearchWithFirstKey) - optsNodes.SetBlockBasedTableFactory(bbto) + opts.SetBlockBasedTableFactory(bbto) // Configure compression. // https://github.com/facebook/rocksdb/wiki/Setup-Options-and-Basic-Tuning#compression - optsNodes.SetCompression(grocksdb.LZ4Compression) - optsNodes.SetBottommostCompression(grocksdb.ZSTDCompression) + opts.SetCompression(grocksdb.LZ4Compression) + opts.SetBottommostCompression(grocksdb.ZSTDCompression) // Configure ZSTD (follows Cosmos-SDK values). compressOpts := grocksdb.NewDefaultCompressionOptions() compressOpts.MaxDictBytes = 110 * 1024 // 110KB - typical size for ZSTD. compressOpts.Level = 12 // Higher compression. - optsNodes.SetBottommostCompressionOptions(compressOpts, true) - optsNodes.SetBottommostCompressionOptionsZstdMaxTrainBytes(compressOpts.MaxDictBytes*100, true) // 100 * dict size. - optsNodes.SetCompressionOptionsParallelThreads(4) + opts.SetBottommostCompressionOptions(compressOpts, true) + opts.SetBottommostCompressionOptionsZstdMaxTrainBytes(compressOpts.MaxDictBytes*100, true) // 100 * dict size. + opts.SetCompressionOptionsParallelThreads(4) /* // TODO: only enable statistics via a config param. @@ -178,6 +165,29 @@ func New(cfg *api.Config) (api.NodeDB, error) { optsNodes.EnableStatistics() */ + return opts +} + +// New creates a new RocksDB-backed node database. +func New(cfg *api.Config) (api.NodeDB, error) { + db := &rocksdbNodeDB{ + logger: logging.GetLogger("mkvs/db/rocksdb"), + namespace: cfg.Namespace, + discardWriteLogs: cfg.DiscardWriteLogs, + readOnly: cfg.ReadOnly, + } + + // Create options for the metadata column family. + // TODO: Consider also tuning some options of the metadata CF (although this is small compared to nodes CFs). + optsMeta := grocksdb.NewDefaultOptions() + optsMeta.SetCreateIfMissing(true) + optsMeta.SetCreateIfMissingColumnFamilies(true) + + // Create options for the node column families. + optsRoots := newOptions(false, cfg.MaxCacheSize) + // TODO: Consider separate options for state vs. io. 
+	optsNodes := newOptions(true, cfg.MaxCacheSize)
+
 	var err error
 	var cfHandles []*grocksdb.ColumnFamilyHandle
 	switch cfg.ReadOnly {
@@ -193,7 +203,7 @@ func New(cfg *api.Config) (api.NodeDB, error) {
 			},
 			[]*grocksdb.Options{
 				optsMeta,
-				optsNodes,
+				optsRoots,
 				optsNodes,
 				optsNodes,
 			},
@@ -210,7 +220,7 @@ func New(cfg *api.Config) (api.NodeDB, error) {
 			},
 			[]*grocksdb.Options{
 				optsMeta,
-				optsNodes,
+				optsRoots,
 				optsNodes,
 				optsNodes,
 			},

From 840a6c0b9a474bb72af3ed16eaf9a81a9849a635 Mon Sep 17 00:00:00 2001
From: ptrus
Date: Fri, 10 Nov 2023 16:19:12 +0100
Subject: [PATCH 24/28] rocksdb: use ptmalloc in dockerfile

---
 docker/oasis-core-dev/Dockerfile           | 18 ++++++++++++++----
 docs/development-setup/prerequisites.md    |  2 +-
 go/storage/mkvs/db/rocksdb/batch.go        |  7 ++++---
 go/storage/mkvs/db/rocksdb/rocksdb_test.go |  2 ++
 4 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/docker/oasis-core-dev/Dockerfile b/docker/oasis-core-dev/Dockerfile
index faa9066bcfe..83990d80a16 100644
--- a/docker/oasis-core-dev/Dockerfile
+++ b/docker/oasis-core-dev/Dockerfile
@@ -85,7 +85,7 @@ RUN wget https://dl.google.com/go/go${GO_VERSION}.linux-amd64.tar.gz && \
     go install mvdan.cc/gofumpt@${GOFUMPT_VERSION} && \
     go install golang.org/x/tools/cmd/goimports@${GOIMPORTS_VERSION}

-# Install jemalloc (used by BadgerDB).
+# Install jemalloc for BadgerDB ('je_' API prefix).
 RUN wget -O jemalloc.tar.bz2 \
       https://github.com/jemalloc/jemalloc/releases/download/${JEMALLOC_VERSION}/jemalloc-${JEMALLOC_VERSION}.tar.bz2 && \
     # Ensure checksum matches.
@@ -101,15 +101,25 @@ RUN wget -O jemalloc.tar.bz2 \
     cd .. && rm jemalloc.tar.bz2 && rm -rf jemalloc-${JEMALLOC_VERSION}

 # Install RocksDB.
+# https://github.com/facebook/rocksdb/blob/main/INSTALL.md
 RUN wget -q https://github.com/facebook/rocksdb/archive/v${ROCKSDB_VERSION}.tar.gz && \
     # Ensure checksum matches.
     echo "${ROCKSDB_CHECKSUM} v${ROCKSDB_VERSION}.tar.gz" | sha256sum -c && \
     tar -zxf v${ROCKSDB_VERSION}.tar.gz && \
    cd rocksdb-${ROCKSDB_VERSION} && \
-    # TODO: clashes with jemalloc used by BadgerDB.
+    # In production it's recommended to use either tcmalloc or jemalloc instead of ptmalloc:
+    # https://blog.cloudflare.com/the-effect-of-switching-to-tcmalloc-on-rocksdb-memory-use/
+    # However RocksDB's jemalloc requirement clashes with BadgerDB (RocksDB requires it without the 'je_' prefix).
+    # Using tcmalloc next to BadgerDB's jemalloc also causes problems. Therefore we default to ptmalloc for
+    # the docker container. If/when RocksDB becomes the preferred database, we should default to building
+    # BadgerDB without jemalloc and use tcmalloc/jemalloc here.
+    #
+    # Disable jemalloc as it is used by default if found on the system.
+    ROCKSDB_DISABLE_JEMALLOC=1 \
     # For 64-bit x86 the `PORTABLE=haswell` is a reasonable compromise, which supports many or most
     # of the available optimizations while still being compatible with most processors made since
-    # roughly 2013. https://github.com/facebook/rocksdb/blob/main/INSTALL.md
-    DEBUG_LEVEL=0 ROCKSDB_DISABLE_JEMALLOC=1 PORTABLE=haswell make -j4 shared_lib && \
+    # roughly 2013.
+    PORTABLE=haswell \
+    DEBUG_LEVEL=0 make -j4 shared_lib && \
     make install-shared && ldconfig && \
     cd .. && rm -rf v${ROCKSDB_VERSION}.tar.gz rocksdb-${ROCKSDB_VERSION}
diff --git a/docs/development-setup/prerequisites.md b/docs/development-setup/prerequisites.md
index 4b39b0efb1f..5b348d37e29 100644
--- a/docs/development-setup/prerequisites.md
+++ b/docs/development-setup/prerequisites.md
@@ -232,7 +232,7 @@ Core:
   echo "${ROCKSDB_CHECKSUM} rocksdb.tar.gz" | sha256sum -c
   tar -zxf rocksdb.tar.gz
   cd rocksdb-${ROCKSDB_VERSION}
-  DEBUG_LEVEL=0 ROCKSDB_DISABLE_JEMALLOC=1 make -j4 shared_lib
+  DEBUG_LEVEL=0 ROCKSDB_DISABLE_MALLOC_USABLE_SIZE=1 ROCKSDB_DISABLE_JEMALLOC=1 make -j4 shared_lib
   sudo make install-shared
   sudo ldconfig
   popd
diff --git a/go/storage/mkvs/db/rocksdb/batch.go b/go/storage/mkvs/db/rocksdb/batch.go
index b80666c75b4..38582032d90 100644
--- a/go/storage/mkvs/db/rocksdb/batch.go
+++ b/go/storage/mkvs/db/rocksdb/batch.go
@@ -106,8 +106,9 @@ func (ba *rocksdbBatch) Commit(root node.Root) error {
 	if err != nil {
 		return err
 	}
-	// Check if oldRootsMeta was updated in this batch.
-	// TODO: could this be avoided?
+
+	// Check if oldRootsMeta was updated in this batch, since reads don't
+	// observe batch updates until the batch is applied.
 	wbIter := ba.bat.NewIterator()
 	for {
 		if !wbIter.Next() {
 			break
 		}
 		rec := wbIter.Record()
 		if bytes.Equal(rec.Key, rootsMetadataKeyFmt.Encode(ba.oldRoot.Version)) {
-			if rec.Type == grocksdb.WriteBatchValueRecord {
+			if rec.Type == grocksdb.WriteBatchCFValueRecord {
 				if err = cbor.Unmarshal(rec.Value, &oldRootsMeta); err != nil {
 					panic(err)
 				}
diff --git a/go/storage/mkvs/db/rocksdb/rocksdb_test.go b/go/storage/mkvs/db/rocksdb/rocksdb_test.go
index 567d2f2cdd9..fb87096c043 100644
--- a/go/storage/mkvs/db/rocksdb/rocksdb_test.go
+++ b/go/storage/mkvs/db/rocksdb/rocksdb_test.go
@@ -121,6 +121,7 @@ func createCheckpoint(ctx context.Context, require *require.Assertions, dir stri
 				nodeKeys[string(it.Key())] = struct{}{}
 			}
 		}
+		return nil
 	})
 }
 	loadNodes(rocksdb.cfIOTree)
@@ -148,6 +149,7 @@ func verifyNodes(require *require.Assertions, rocksdb *rocksdbNodeDB, version ui
 		require.Equal(true, ok, "unexpected node in db")
 		delete(notVisited, string(key))
 	}
+		return nil
 	})
 }
 	checkNodes(rocksdb.cfIOTree)

From 3e0d37717427cb9088d03b17674b9c5171937f44 Mon Sep 17 00:00:00 2001
From: ptrus
Date: Tue, 5 Dec 2023 10:40:42 +0100
Subject: [PATCH 25/28] bump gorocksdb

---
 go/go.mod | 2 +-
 go/go.sum | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/go/go.mod b/go/go.mod
index 2bea3fb20c5..7f8e84b8ad4 100644
--- a/go/go.mod
+++ b/go/go.mod
@@ -33,7 +33,7 @@ require (
 	github.com/ipfs/go-log/v2 v2.5.1
 	github.com/libp2p/go-libp2p v0.30.0
 	github.com/libp2p/go-libp2p-pubsub v0.9.3
-	github.com/linxGnu/grocksdb v1.8.5
+	github.com/linxGnu/grocksdb v1.8.6
 	github.com/multiformats/go-multiaddr v0.11.0
 	github.com/oasisprotocol/curve25519-voi v0.0.0-20230110094441-db37f07504ce
 	github.com/oasisprotocol/deoxysii v0.0.0-20220228165953-2091330c22b7
diff --git a/go/go.sum b/go/go.sum
index 10fd05deb2a..ff90ce38757 100644
--- a/go/go.sum
+++ b/go/go.sum
@@ -417,8 +417,8 @@ github.com/libp2p/go-reuseport v0.4.0 h1:nR5KU7hD0WxXCJbmw7r2rhRYruNRl2koHw8fQsc
 github.com/libp2p/go-reuseport v0.4.0/go.mod h1:ZtI03j/wO5hZVDFo2jKywN6bYKWLOy8Se6DrI2E1cLU=
 github.com/libp2p/go-yamux/v4 v4.0.1 h1:FfDR4S1wj6Bw2Pqbc8Uz7pCxeRBPbwsBbEdfwiCypkQ=
 github.com/libp2p/go-yamux/v4 v4.0.1/go.mod h1:NWjl8ZTLOGlozrXSOZ/HlfG++39iKNnM5wwmtQP1YB4=
-github.com/linxGnu/grocksdb v1.8.5 h1:Okfk5B1h0ikCYdDM7Tc5yJUS8LTwAmMBq5IPWTmOLPs=
-github.com/linxGnu/grocksdb v1.8.5/go.mod h1:xZCIb5Muw+nhbDK4Y5UJuOrin5MceOuiXkVUR7vp4WY=
+github.com/linxGnu/grocksdb v1.8.6 h1:O7I6SIGPrypf3f/gmrrLUBQDKfO8uOoYdWf4gLS06tc=
+github.com/linxGnu/grocksdb v1.8.6/go.mod h1:xZCIb5Muw+nhbDK4Y5UJuOrin5MceOuiXkVUR7vp4WY=
 github.com/lunixbochs/vtclean v1.0.0/go.mod h1:pHhQNgMf3btfWnGBVipUOjRYhoOsdGqdm/+2c2E2WMI=
 github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
 github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY=

From a8bd21af1d87a052a2670f11955d589ba76b56b1 Mon Sep 17 00:00:00 2001
From: ptrus
Date: Wed, 13 Dec 2023 08:29:05 +0100
Subject: [PATCH 26/28] rocksdb: sync writes and additional checks

---
 go/storage/mkvs/db/rocksdb/batch.go    |  4 ++--
 go/storage/mkvs/db/rocksdb/metadata.go |  8 +++----
 go/storage/mkvs/db/rocksdb/rocksdb.go  | 29 ++++++++++++++++----------
 3 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/go/storage/mkvs/db/rocksdb/batch.go b/go/storage/mkvs/db/rocksdb/batch.go
index 38582032d90..944203d9c82 100644
--- a/go/storage/mkvs/db/rocksdb/batch.go
+++ b/go/storage/mkvs/db/rocksdb/batch.go
@@ -147,11 +147,11 @@ func (ba *rocksdbBatch) Commit(root node.Root) error {

 	// Flush node updates.
 	if ba.multipartNodes != nil {
-		if err = ba.db.db.Write(defaultWriteOptions, ba.multipartNodes); err != nil {
+		if err = ba.db.db.Write(ba.db.defaultWriteOptions, ba.multipartNodes); err != nil {
 			return fmt.Errorf("mkvs/rocksdb: failed to flush node log batch: %w", err)
 		}
 	}
-	if err = ba.db.db.Write(defaultWriteOptions, ba.bat); err != nil {
+	if err = ba.db.db.Write(ba.db.defaultWriteOptions, ba.bat); err != nil {
 		return fmt.Errorf("mkvs/rocksdb: failed to flush batch: %w", err)
 	}

diff --git a/go/storage/mkvs/db/rocksdb/metadata.go b/go/storage/mkvs/db/rocksdb/metadata.go
index c5255005a8d..cde70793cc1 100644
--- a/go/storage/mkvs/db/rocksdb/metadata.go
+++ b/go/storage/mkvs/db/rocksdb/metadata.go
@@ -90,16 +90,16 @@ func (m *metadata) getMultipartVersion() uint64 {
 	return m.value.MultipartVersion
 }

-func (m *metadata) setMultipartVersion(db *grocksdb.DB, version uint64) error {
+func (m *metadata) setMultipartVersion(db *grocksdb.DB, wo *grocksdb.WriteOptions, version uint64) error {
 	m.Lock()
 	defer m.Unlock()

 	m.value.MultipartVersion = version
-	return m.save(db)
+	return m.save(db, wo)
 }

-func (m *metadata) save(db *grocksdb.DB) error {
-	return db.Put(defaultWriteOptions, metadataKeyFmt.Encode(), cbor.Marshal(m.value))
+func (m *metadata) save(db *grocksdb.DB, wo *grocksdb.WriteOptions) error {
+	return db.Put(wo, metadataKeyFmt.Encode(), cbor.Marshal(m.value))
 }

 // TODO: Collapse with save.
diff --git a/go/storage/mkvs/db/rocksdb/rocksdb.go b/go/storage/mkvs/db/rocksdb/rocksdb.go
index b3a955caf54..0a20809ff57 100644
--- a/go/storage/mkvs/db/rocksdb/rocksdb.go
+++ b/go/storage/mkvs/db/rocksdb/rocksdb.go
@@ -88,7 +88,6 @@ var (
 )

 var (
-	defaultWriteOptions = grocksdb.NewDefaultWriteOptions()
 	defaultReadOptions  = grocksdb.NewDefaultReadOptions()
 	defaultFlushOptions = grocksdb.NewDefaultFlushOptions()
 )
@@ -109,6 +108,7 @@ func newOptions(versioned bool, maxCacheSize int64) *grocksdb.Options {

 	// TODO: Consider separate options for state vs. io.
 	opts := grocksdb.NewDefaultOptions()
+	opts.SetParanoidChecks(true)
 	opts.SetCreateIfMissing(true)
 	if versioned {
 		opts.SetComparator(createTimestampComparator())
@@ -171,11 +171,14 @@ func newOptions(versioned bool, maxCacheSize int64) *grocksdb.Options {

 // New creates a new RocksDB-backed node database.
func New(cfg *api.Config) (api.NodeDB, error) { db := &rocksdbNodeDB{ - logger: logging.GetLogger("mkvs/db/rocksdb"), - namespace: cfg.Namespace, - discardWriteLogs: cfg.DiscardWriteLogs, - readOnly: cfg.ReadOnly, + logger: logging.GetLogger("mkvs/db/rocksdb"), + namespace: cfg.Namespace, + discardWriteLogs: cfg.DiscardWriteLogs, + readOnly: cfg.ReadOnly, + defaultWriteOptions: grocksdb.NewDefaultWriteOptions(), } + // Configure fsync. + db.defaultWriteOptions.SetSync(!cfg.NoFsync) // Create options for the metadata column family. // TODO: Consider also tuning some options of the metadata CF (although this is small compared to nodes CFs). @@ -270,6 +273,8 @@ type rocksdbNodeDB struct { cfStateTree *grocksdb.ColumnFamilyHandle cfIOTree *grocksdb.ColumnFamilyHandle + defaultWriteOptions *grocksdb.WriteOptions + closeOnce sync.Once } @@ -328,7 +333,7 @@ func (d *rocksdbNodeDB) load() error { // No metadata exists, create some. d.meta.value.Version = dbVersion d.meta.value.Namespace = d.namespace - if err = d.meta.save(d.db); err != nil { + if err = d.meta.save(d.db, d.defaultWriteOptions); err != nil { return err } @@ -763,7 +768,7 @@ func (d *rocksdbNodeDB) Finalize(roots []node.Root) error { // nolint: gocyclo d.meta.setLastFinalizedVersion(batch, version) // Commit batch. - if err := d.db.Write(defaultWriteOptions, batch); err != nil { + if err := d.db.Write(d.defaultWriteOptions, batch); err != nil { return fmt.Errorf("mkvs/rocksdb: failed to commit finalized roots: %w", err) } @@ -875,7 +880,7 @@ func (d *rocksdbNodeDB) Prune(ctx context.Context, version uint64) error { // Update metadata. d.meta.setEarliestVersion(batch, version+1) - if err := d.db.Write(defaultWriteOptions, batch); err != nil { + if err := d.db.Write(d.defaultWriteOptions, batch); err != nil { return fmt.Errorf("mkvs/rocksdb: failed to prune version %d: %w", version, err) } @@ -905,7 +910,7 @@ func (d *rocksdbNodeDB) StartMultipartInsert(version uint64) error { return nil } - if err := d.meta.setMultipartVersion(d.db, version); err != nil { + if err := d.meta.setMultipartVersion(d.db, d.defaultWriteOptions, version); err != nil { return err } d.multipartVersion = version @@ -976,11 +981,11 @@ func (d *rocksdbNodeDB) cleanMultipartLocked(removeNodes bool) error { // Apply the batch first. If anything fails, having corrupt // multipart info in d.meta shouldn't hurt us next run. 
- if err := d.db.Write(defaultWriteOptions, batch); err != nil { + if err := d.db.Write(d.defaultWriteOptions, batch); err != nil { return err } - if err := d.meta.setMultipartVersion(d.db, multipartVersionNone); err != nil { + if err := d.meta.setMultipartVersion(d.db, d.defaultWriteOptions, multipartVersionNone); err != nil { return err } @@ -1037,11 +1042,13 @@ func (d *rocksdbNodeDB) Sync() error { func (d *rocksdbNodeDB) Close() { d.closeOnce.Do(func() { + d.defaultWriteOptions.Destroy() d.db.Close() d.cfMetadata = nil d.cfRoots = nil d.cfIOTree = nil d.cfStateTree = nil + d.defaultWriteOptions = nil d.db = nil }) } From 73a3afc795c7ed05bdd9999058419873e6b52278 Mon Sep 17 00:00:00 2001 From: ptrus Date: Tue, 19 Dec 2023 10:24:44 +0100 Subject: [PATCH 27/28] rocksdb: fix --- go/storage/mkvs/db/rocksdb/iterator.go | 5 +++++ go/storage/mkvs/db/rocksdb/rocksdb.go | 12 ++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/go/storage/mkvs/db/rocksdb/iterator.go b/go/storage/mkvs/db/rocksdb/iterator.go index 663c5d5cff0..208bd0bd87b 100644 --- a/go/storage/mkvs/db/rocksdb/iterator.go +++ b/go/storage/mkvs/db/rocksdb/iterator.go @@ -81,6 +81,11 @@ func (itr *iterator) Key() []byte { return copyAndFreeSlice(itr.source.Key()) } +func (itr *iterator) Timestamp() []byte { + itr.assertIsValid() + return copyAndFreeSlice(itr.source.Timestamp()) +} + func (itr *iterator) Value() []byte { itr.assertIsValid() return copyAndFreeSlice(itr.source.Value()) diff --git a/go/storage/mkvs/db/rocksdb/rocksdb.go b/go/storage/mkvs/db/rocksdb/rocksdb.go index 0a20809ff57..178254ad286 100644 --- a/go/storage/mkvs/db/rocksdb/rocksdb.go +++ b/go/storage/mkvs/db/rocksdb/rocksdb.go @@ -108,7 +108,7 @@ func newOptions(versioned bool, maxCacheSize int64) *grocksdb.Options { // TODO: Consider separate options for state vs. io. opts := grocksdb.NewDefaultOptions() - opts.SetParanoidChecks(true) + // opts.SetParanoidChecks(true) opts.SetCreateIfMissing(true) if versioned { opts.SetComparator(createTimestampComparator()) @@ -828,23 +828,23 @@ func (d *rocksdbNodeDB) Prune(ctx context.Context, version uint64) error { itRo := timestampReadOptions(root.Version) defer itRo.Destroy() - s, ts, err := d.db.GetCFWithTS(itRo, cf, nodeKeyFmt.Encode(&h)) + s, itemTs, err := d.db.GetCFWithTS(itRo, cf, nodeKeyFmt.Encode(&h)) if err != nil { return false } defer s.Free() if !s.Exists() { - ts.Free() + itemTs.Free() return false } - itemTs, err := versionFromTimestamp(ts) + itemVersion, err := versionFromTimestamp(itemTs) if err != nil { // Shouldn't happen unless corrupted db. 
			panic(fmt.Errorf("mkvs/rocksdb: missing/corrupted timestamp for node: %s", h))
 		}
-		if itemTs == version {
-			batch.DeleteCFWithTS(cf, nodeKeyFmt.Encode(&h), ts.Data())
+		if itemVersion == version {
+			batch.DeleteCFWithTS(cf, nodeKeyFmt.Encode(&h), ts[:])
 		}
 		return true
 	})

From fba73421325248720ae5224323ba3531a9c9b72f Mon Sep 17 00:00:00 2001
From: ptrus
Date: Tue, 19 Dec 2023 10:30:16 +0100
Subject: [PATCH 28/28] rocksdb: update to 8.8.1

---
 docker/oasis-core-dev/Dockerfile        | 4 ++--
 docs/development-setup/prerequisites.md | 6 +++---
 go/go.mod                               | 2 +-
 go/go.sum                               | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/docker/oasis-core-dev/Dockerfile b/docker/oasis-core-dev/Dockerfile
index 83990d80a16..4dd172e1693 100644
--- a/docker/oasis-core-dev/Dockerfile
+++ b/docker/oasis-core-dev/Dockerfile
@@ -13,8 +13,8 @@ ARG GOIMPORTS_VERSION=v0.12.0
 ARG RUST_NIGHTLY_VERSION=2023-01-16
 ARG JEMALLOC_VERSION=5.2.1
 ARG JEMALLOC_CHECKSUM=34330e5ce276099e2e8950d9335db5a875689a4c6a56751ef3b1d8c537f887f6
-ARG ROCKSDB_VERSION=8.5.3
-ARG ROCKSDB_CHECKSUM=ed4230500b9ca20bc7918c32166b2d0d46a8695c59991821daa586d55689d785
+ARG ROCKSDB_VERSION=8.8.1
+ARG ROCKSDB_CHECKSUM=056c7e21ad8ae36b026ac3b94b9d6e0fcc60e1d937fc80330921e4181be5c36e

 # Legacy package versions (upgrade tests).
 ARG LEGACY_GO_VERSION=1.20.2
diff --git a/docs/development-setup/prerequisites.md b/docs/development-setup/prerequisites.md
index 5b348d37e29..ff761ca240e 100644
--- a/docs/development-setup/prerequisites.md
+++ b/docs/development-setup/prerequisites.md
@@ -210,7 +210,7 @@ Core:
   (i.e. you can't use `./configure --prefix=$HOME/.local ...`)
   because upstream authors [hardcode its path][jemalloc-hardcode-path]._

-* (**OPTIONAL**) [rocksdb] (version 8.5.3)
+* (**OPTIONAL**) [rocksdb] (version 8.8.1)

 # TODO: investigate clashing with jemalloc built above.
   Alternatively set `OASIS_NO_ROCKSDB="1"` environment variable when building
@@ -223,8 +223,8 @@ Core:
   # Install prerequisites.
   apt install libgflags-dev libsnappy-dev libbz2-dev liblz4-dev libzstd-dev
   # Build RocksDB.
- ROCKSDB_VERSION=8.5.3 - ROCKSDB_CHECKSUM=ed4230500b9ca20bc7918c32166b2d0d46a8695c59991821daa586d55689d785 + ROCKSDB_VERSION=8.8.1 + ROCKSDB_CHECKSUM=056c7e21ad8ae36b026ac3b94b9d6e0fcc60e1d937fc80330921e4181be5c36e pushd $(mktemp -d) wget -O rocksdb.tar.gz \ https://github.com/facebook/rocksdb/archive/v${ROCKSDB_VERSION}.tar.gz diff --git a/go/go.mod b/go/go.mod index 7f8e84b8ad4..31da9284164 100644 --- a/go/go.mod +++ b/go/go.mod @@ -33,7 +33,7 @@ require ( github.com/ipfs/go-log/v2 v2.5.1 github.com/libp2p/go-libp2p v0.30.0 github.com/libp2p/go-libp2p-pubsub v0.9.3 - github.com/linxGnu/grocksdb v1.8.6 + github.com/linxGnu/grocksdb v1.8.10 github.com/multiformats/go-multiaddr v0.11.0 github.com/oasisprotocol/curve25519-voi v0.0.0-20230110094441-db37f07504ce github.com/oasisprotocol/deoxysii v0.0.0-20220228165953-2091330c22b7 diff --git a/go/go.sum b/go/go.sum index ff90ce38757..969745e4f5b 100644 --- a/go/go.sum +++ b/go/go.sum @@ -417,8 +417,8 @@ github.com/libp2p/go-reuseport v0.4.0 h1:nR5KU7hD0WxXCJbmw7r2rhRYruNRl2koHw8fQsc github.com/libp2p/go-reuseport v0.4.0/go.mod h1:ZtI03j/wO5hZVDFo2jKywN6bYKWLOy8Se6DrI2E1cLU= github.com/libp2p/go-yamux/v4 v4.0.1 h1:FfDR4S1wj6Bw2Pqbc8Uz7pCxeRBPbwsBbEdfwiCypkQ= github.com/libp2p/go-yamux/v4 v4.0.1/go.mod h1:NWjl8ZTLOGlozrXSOZ/HlfG++39iKNnM5wwmtQP1YB4= -github.com/linxGnu/grocksdb v1.8.6 h1:O7I6SIGPrypf3f/gmrrLUBQDKfO8uOoYdWf4gLS06tc= -github.com/linxGnu/grocksdb v1.8.6/go.mod h1:xZCIb5Muw+nhbDK4Y5UJuOrin5MceOuiXkVUR7vp4WY= +github.com/linxGnu/grocksdb v1.8.10 h1:6FAhBThErRfJaevGOZISYvkG7RD4gfzeq452X4r8pes= +github.com/linxGnu/grocksdb v1.8.10/go.mod h1:xZCIb5Muw+nhbDK4Y5UJuOrin5MceOuiXkVUR7vp4WY= github.com/lunixbochs/vtclean v1.0.0/go.mod h1:pHhQNgMf3btfWnGBVipUOjRYhoOsdGqdm/+2c2E2WMI= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY=
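
End-to-end usage sketch for reviewers (illustrative only: the path and cache size are hypothetical, `ns` stands in for the runtime's common.Namespace, and this is a fragment rather than a complete program; the api.Config fields are the ones this series reads — DB, Namespace, MaxCacheSize, NoFsync, ReadOnly, DiscardWriteLogs):

	import (
		dbapi "github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/api"
		"github.com/oasisprotocol/oasis-core/go/storage/mkvs/db/rocksdb"
	)

	// Open (or create) a RocksDB-backed MKVS node database.
	ndb, err := rocksdb.New(&dbapi.Config{
		DB:           "/var/tmp/mkvs-rocksdb", // Hypothetical path.
		Namespace:    ns,                      // Assumed common.Namespace value.
		MaxCacheSize: 128 * 1024 * 1024,       // Block cache budget in bytes; 0 falls back to the 128 MB default.
		NoFsync:      false,                   // Keep synchronous writes (see PATCH 26/28).
	})
	if err != nil {
		return err
	}
	defer ndb.Close()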