From 4e936bd75071df78c9bcd3bee81720b40ef08966 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Thu, 16 Apr 2026 17:07:44 -0400 Subject: [PATCH 01/15] perf: speed up historical MVCC reads and trace replays MVCC PebbleDB changes: - Invert version ordering so newest versions sort first per key, making historical lookups a direct SeekGE instead of an unbounded reverse seek. - Introduce a per-request historical read session that holds one Pebble snapshot plus reusable iterators per store prefix, eliminating the per-read iterator create/destroy overhead. - Add a request-scoped (module, key, version) read cache so repeated hot-key reads inside a single trace skip Pebble entirely. - Add a sentinel-versioned latest-value fast path (math.MaxInt64 per key) usable as a bloom-filter accelerated db.Get when tracing at or past the latest committed height. - Harden batch writes: sort MVCC batches before commit, guard against empty store keys/keys to keep state-sync imports safe, and commit the latest-version metadata key separately so it never violates Pebble's strictly-increasing batch invariant. - Extend iterator navigation to drive MVCC seeks by logical key under the new descending-version encoding, with forward/reverse helpers and defensive skips for the sentinel pointer. Tracing hooks: - Add ReadTraceCollector / ReadTraceEvent / TraceableStateStore types in db_engine/types so callers can optionally observe low-level reads. The MVCC Database implements TraceableStateStore as a no-op when no collector is attached, so this change is zero-cost in the default configuration; the actual consumers live in the follow-up profiling PR. evmrpc: - Add a block-level replay cache keyed by block hash + tx index so a debug trace request that replays txs [0..N) against the historical state reuses the post-tx context from earlier calls in the same block instead of replaying from scratch. 
No migration is required: the new on-disk version encoding is applied on fresh state-synced data, and the latest-value sentinel is additive. Made-with: Cursor --- evmrpc/simulate.go | 49 +- sei-db/db_engine/pebbledb/mvcc/batch.go | 142 +++- sei-db/db_engine/pebbledb/mvcc/comparator.go | 20 +- sei-db/db_engine/pebbledb/mvcc/db.go | 697 +++++++++++++++++-- sei-db/db_engine/pebbledb/mvcc/iterator.go | 340 +++++---- sei-db/db_engine/types/types.go | 29 + 6 files changed, 996 insertions(+), 281 deletions(-) diff --git a/evmrpc/simulate.go b/evmrpc/simulate.go index 8da2dd2f1c..7621f69fb8 100644 --- a/evmrpc/simulate.go +++ b/evmrpc/simulate.go @@ -230,6 +230,8 @@ type Backend struct { globalBlockCache BlockCache cacheCreationMutex *sync.Mutex watermarks *WatermarkManager + replayStateCacheMu sync.RWMutex + replayStateCache map[string]map[int]sdk.Context } func NewBackend( @@ -257,6 +259,7 @@ func NewBackend( globalBlockCache: globalBlockCache, cacheCreationMutex: cacheCreationMutex, watermarks: watermarks, + replayStateCache: map[string]map[int]sdk.Context{}, } } @@ -490,7 +493,19 @@ func (b *Backend) ReplayTransactionTillIndex(ctx context.Context, block *ethtype if txIndex < 0 { return state.NewDBImpl(sdkCtx.WithIsEVM(true), b.keeper, true), tmBlock.Block.Txs, nil } + + startIdx := 0 + if cachedCtx, cachedIdx, ok := b.getReplayState(block.Hash().Hex(), txIndex); ok { + sdkCtx = sdkCtx.WithMultiStore(cachedCtx.MultiStore()) + startIdx = cachedIdx + 1 + } else { + b.putReplayState(block.Hash().Hex(), -1, sdkCtx.WithTraceMode(true)) + } + for idx, tx := range tmBlock.Block.Txs { + if idx < startIdx { + continue + } if idx > txIndex { break } @@ -503,7 +518,39 @@ func (b *Backend) ReplayTransactionTillIndex(ctx context.Context, block *ethtype } _ = b.app.DeliverTx(sdkCtx, abci.RequestDeliverTxV2{Tx: tx}, sdkTx, sha256.Sum256(tx)) } - return state.NewDBImpl(sdkCtx.WithIsEVM(true), b.keeper, true), tmBlock.Block.Txs, nil + finalCtx := sdkCtx.WithIsEVM(true) + 
b.putReplayState(block.Hash().Hex(), txIndex, finalCtx.WithTraceMode(true)) + return state.NewDBImpl(finalCtx, b.keeper, true), tmBlock.Block.Txs, nil +} + +func (b *Backend) getReplayState(blockHash string, txIndex int) (sdk.Context, int, bool) { + b.replayStateCacheMu.RLock() + defer b.replayStateCacheMu.RUnlock() + blockStates, ok := b.replayStateCache[blockHash] + if !ok { + return sdk.Context{}, 0, false + } + bestIdx := math.MinInt + var bestCtx sdk.Context + for idx, ctx := range blockStates { + if idx <= txIndex && idx > bestIdx { + bestIdx = idx + bestCtx = ctx + } + } + if bestIdx == math.MinInt { + return sdk.Context{}, 0, false + } + return bestCtx, bestIdx, true +} + +func (b *Backend) putReplayState(blockHash string, txIndex int, ctx sdk.Context) { + b.replayStateCacheMu.Lock() + defer b.replayStateCacheMu.Unlock() + if _, ok := b.replayStateCache[blockHash]; !ok { + b.replayStateCache[blockHash] = map[int]sdk.Context{} + } + b.replayStateCache[blockHash][txIndex] = ctx } func (b *Backend) StateAtBlock(ctx context.Context, block *ethtypes.Block, reexec uint64, base vm.StateDB, readOnly bool, preferDisk bool) (vm.StateDB, tracers.StateReleaseFunc, error) { diff --git a/sei-db/db_engine/pebbledb/mvcc/batch.go b/sei-db/db_engine/pebbledb/mvcc/batch.go index 6d6dfcd11f..fd4f88fb17 100644 --- a/sei-db/db_engine/pebbledb/mvcc/batch.go +++ b/sei-db/db_engine/pebbledb/mvcc/batch.go @@ -4,6 +4,7 @@ import ( "context" "encoding/binary" "fmt" + "sort" "time" "github.com/cockroachdb/pebble/v2" @@ -14,46 +15,50 @@ import ( type Batch struct { storage *pebble.DB - batch *pebble.Batch version int64 + ops []batchOp +} + +type batchOp struct { + key []byte + value []byte + delete bool + order int } func NewBatch(storage *pebble.DB, version int64) (*Batch, error) { if version < 0 { return nil, fmt.Errorf("version must be non-negative") } - var versionBz [VersionSize]byte - binary.LittleEndian.PutUint64(versionBz[:], uint64(version)) - - batch := storage.NewBatch() - - 
if err := batch.Set([]byte(latestVersionKey), versionBz[:], nil); err != nil { - return nil, fmt.Errorf("failed to write PebbleDB batch: %w", err) - } - - return &Batch{ + b := &Batch{ storage: storage, - batch: batch, version: version, - }, nil + ops: make([]batchOp, 0, 16), + } + return b, nil } func (b *Batch) Size() int { - return b.batch.Len() + return len(b.ops) } func (b *Batch) Reset() { - b.batch.Reset() + b.ops = b.ops[:0] } func (b *Batch) set(storeKey string, tombstone int64, key, value []byte) error { prefixedKey := MVCCEncode(prependStoreKey(storeKey, key), b.version) prefixedVal := MVCCEncode(value, tombstone) - if err := b.batch.Set(prefixedKey, prefixedVal, nil); err != nil { - return fmt.Errorf("failed to write PebbleDB batch: %w", err) + b.appendSet(prefixedKey, prefixedVal) + // Also write a latest-version pointer at the sentinel version for fast + // direct-Get reads at recent heights. The sentinel is a reserved real + // MVCC version (math.MaxInt64), so the custom comparer parses it exactly + // like any other historical entry and no separate keyspace is introduced. 
+ if storeKey != "" && len(key) > 0 { + latestPtrKey := MVCCEncode(prependStoreKey(storeKey, key), latestPointerVersion) + b.appendSet(latestPtrKey, encodeLatestPointerValue(b.version, prefixedVal)) } - return nil } @@ -67,10 +72,9 @@ func (b *Batch) Delete(storeKey string, key []byte) error { func (b *Batch) Write() (err error) { startTime := time.Now() - batchSize := int64(b.batch.Len()) + batchSize := int64(len(b.ops)) defer func() { - err = errors.Join(err, b.batch.Close()) ctx := context.Background() otelMetrics.batchWriteLatency.Record( ctx, @@ -83,40 +87,63 @@ func (b *Batch) Write() (err error) { ) }() - return b.batch.Commit(defaultWriteOpts) + batch := b.storage.NewBatch() + defer func() { + err = errors.Join(err, batch.Close()) + }() + sortBatchOps(b.ops) + for _, op := range b.ops { + if op.delete { + if e := batch.Delete(op.key, nil); e != nil { + return fmt.Errorf("failed to delete in PebbleDB batch: %w", e) + } + continue + } + if e := batch.Set(op.key, op.value, nil); e != nil { + return fmt.Errorf("failed to write PebbleDB batch: %w", e) + } + } + if err := batch.Commit(defaultWriteOpts); err != nil { + return err + } + var versionBz [VersionSize]byte + binary.LittleEndian.PutUint64(versionBz[:], uint64(b.version)) + if err := b.storage.Set([]byte(latestVersionKey), versionBz[:], defaultWriteOpts); err != nil { + return fmt.Errorf("failed to update latest version after batch commit: %w", err) + } + return nil } // For writing kv pairs in any order of version type RawBatch struct { storage *pebble.DB - batch *pebble.Batch + ops []batchOp } func NewRawBatch(storage *pebble.DB) (*RawBatch, error) { - batch := storage.NewBatch() - return &RawBatch{ storage: storage, - batch: batch, + ops: make([]batchOp, 0, 16), }, nil } func (b *RawBatch) Size() int { - return b.batch.Len() + return len(b.ops) } func (b *RawBatch) Reset() { - b.batch.Reset() + b.ops = b.ops[:0] } func (b *RawBatch) set(storeKey string, tombstone int64, key, value []byte, version 
int64) error { prefixedKey := MVCCEncode(prependStoreKey(storeKey, key), version) prefixedVal := MVCCEncode(value, tombstone) - if err := b.batch.Set(prefixedKey, prefixedVal, nil); err != nil { - return fmt.Errorf("failed to write PebbleDB batch: %w", err) + b.appendSet(prefixedKey, prefixedVal) + if storeKey != "" && len(key) > 0 { + latestPtrKey := MVCCEncode(prependStoreKey(storeKey, key), latestPointerVersion) + b.appendSet(latestPtrKey, encodeLatestPointerValue(version, prefixedVal)) } - return nil } @@ -132,17 +159,14 @@ func (b *RawBatch) Delete(storeKey string, key []byte, version int64) error { // and calling the underlying pebble.Batch.Delete. func (b *Batch) HardDelete(storeKey string, key []byte) error { fullKey := MVCCEncode(prependStoreKey(storeKey, key), b.version) - if err := b.batch.Delete(fullKey, nil); err != nil { - return fmt.Errorf("failed to hard delete key: %w", err) - } + b.appendDelete(fullKey) return nil } func (b *RawBatch) Write() (err error) { startTime := time.Now() - batchSize := int64(b.batch.Len()) + batchSize := int64(len(b.ops)) defer func() { - err = errors.Join(err, b.batch.Close()) ctx := context.Background() otelMetrics.batchWriteLatency.Record( ctx, @@ -155,5 +179,51 @@ func (b *RawBatch) Write() (err error) { ) }() - return b.batch.Commit(defaultWriteOpts) + batch := b.storage.NewBatch() + defer func() { + err = errors.Join(err, batch.Close()) + }() + sortBatchOps(b.ops) + for _, op := range b.ops { + if op.delete { + if e := batch.Delete(op.key, nil); e != nil { + return fmt.Errorf("failed to delete in PebbleDB batch: %w", e) + } + continue + } + if e := batch.Set(op.key, op.value, nil); e != nil { + return fmt.Errorf("failed to write PebbleDB batch: %w", e) + } + } + return batch.Commit(defaultWriteOpts) +} + +func (b *Batch) appendSet(key, value []byte) { + b.ops = append(b.ops, batchOp{ + key: append([]byte(nil), key...), + value: append([]byte(nil), value...), + order: len(b.ops), + }) +} + +func (b *Batch) 
appendDelete(key []byte) { + b.ops = append(b.ops, batchOp{ + key: append([]byte(nil), key...), + delete: true, + order: len(b.ops), + }) +} + +func (b *RawBatch) appendSet(key, value []byte) { + b.ops = append(b.ops, batchOp{ + key: append([]byte(nil), key...), + value: append([]byte(nil), value...), + order: len(b.ops), + }) +} + +func sortBatchOps(ops []batchOp) { + sort.SliceStable(ops, func(i, j int) bool { + return MVCCComparer.Compare(ops[i].key, ops[j].key) < 0 + }) } diff --git a/sei-db/db_engine/pebbledb/mvcc/comparator.go b/sei-db/db_engine/pebbledb/mvcc/comparator.go index 1e36ecdf92..a6b8afab61 100644 --- a/sei-db/db_engine/pebbledb/mvcc/comparator.go +++ b/sei-db/db_engine/pebbledb/mvcc/comparator.go @@ -134,7 +134,7 @@ type mvccKeyFormatter struct { func (f mvccKeyFormatter) Format(s fmt.State, verb rune) { k, vBz, ok := SplitMVCCKey(f.key) if ok { - v, _ := decodeUint64Ascending(vBz) + v, _ := decodeUint64Descending(vBz) _, _ = fmt.Fprintf(s, "%s/%d", k, v) } else { _, _ = fmt.Fprintf(s, "%s", f.key) @@ -215,17 +215,17 @@ func MVCCEncode(key []byte, version int64) (dst []byte) { if version > 0 { extra := byte(1 + 8) - dst = encodeUint64Ascending(dst, uint64(version)) + dst = encodeUint64Descending(dst, uint64(version)) dst = append(dst, extra) } return dst } -// encodeUint64Ascending encodes the uint64 value using a big-endian 8 byte -// representation. The bytes are appended to the supplied buffer and -// the final buffer is returned. -func encodeUint64Ascending(dst []byte, v uint64) []byte { +// encodeUint64Descending encodes the uint64 value in descending order so newer +// versions sort before older versions for the same logical key. 
+func encodeUint64Descending(dst []byte, v uint64) []byte { + v = ^v return append( dst, byte(v>>56), byte(v>>48), byte(v>>40), byte(v>>32), @@ -233,15 +233,15 @@ func encodeUint64Ascending(dst []byte, v uint64) []byte { ) } -// decodeUint64Ascending decodes a int64 from the input buffer, treating -// the input as a big-endian 8 byte uint64 representation. The decoded int64 is -// returned. -func decodeUint64Ascending(b []byte) (int64, error) { +// decodeUint64Descending decodes a descending-encoded int64 from the input +// buffer and returns the original ascending version value. +func decodeUint64Descending(b []byte) (int64, error) { if len(b) < 8 { return 0, fmt.Errorf("insufficient bytes to decode uint64 int value; expected 8; got %d", len(b)) } uv := binary.BigEndian.Uint64(b) + uv = ^uv if uv > math.MaxInt64 { return 0, fmt.Errorf("uint64 value overflows int64: %d", uv) } diff --git a/sei-db/db_engine/pebbledb/mvcc/db.go b/sei-db/db_engine/pebbledb/mvcc/db.go index 60581eb3c2..a3c8dd4e37 100644 --- a/sei-db/db_engine/pebbledb/mvcc/db.go +++ b/sei-db/db_engine/pebbledb/mvcc/db.go @@ -6,6 +6,7 @@ import ( "encoding/binary" "errors" "fmt" + "io" "math" "strings" "sync" @@ -37,6 +38,14 @@ const ( earliestVersionKey = "s/_earliest" tombstoneVal = "TOMBSTONE" + // latestPointerVersion is a reserved MVCC version used to store a per-key + // "latest value" pointer alongside normal historical entries. Real block + // heights are always far below math.MaxInt64, so this version is never a + // real entry. Under descending version encoding it sorts first per key, + // which lets a direct db.Get(MVCCEncode(key, latestPointerVersion)) serve + // as a bloom-filter accelerated fast path. 
+ latestPointerVersion = int64(math.MaxInt64) + // TODO: Make configurable ImportCommitBatchSize = 10000 PruneCommitBatchSize = 50 @@ -45,11 +54,41 @@ const ( ) var ( - _ types.StateStore = (*Database)(nil) + _ types.StateStore = (*Database)(nil) + _ types.TraceableStateStore = (*Database)(nil) defaultWriteOpts = pebble.NoSync ) +type tracedDatabase struct { + *Database + collector types.ReadTraceCollector + readSession *historicalReadSession +} + +type readTraceCloserRegistry interface { + AddReadTraceCloser(io.Closer) +} + +type historicalReadSession struct { + snapshot *pebble.Snapshot + iterators map[string]*pebble.Iterator + cache map[historicalReadCacheKey]historicalReadCacheValue + mu sync.Mutex + closed bool +} + +type historicalReadCacheKey struct { + storeKey string + version int64 + key string +} + +type historicalReadCacheValue struct { + value []byte + found bool +} + type Database struct { storage *pebble.DB asyncWriteWG sync.WaitGroup @@ -304,11 +343,23 @@ func retrieveEarliestVersion(db *pebble.DB) (int64, error) { } func (db *Database) Has(storeKey string, version int64, key []byte) (bool, error) { + return db.hasWithCollector(storeKey, version, key, nil) +} + +func (db *Database) hasWithCollector(storeKey string, version int64, key []byte, collector types.ReadTraceCollector) (bool, error) { + start := time.Now() + defer recordReadTrace(collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "mvcc", + Operation: "has", + DurationNanos: time.Since(start).Nanoseconds(), + Key: slices.Clone(key), + }) if version < db.GetEarliestVersion() { return false, nil } - val, err := db.Get(storeKey, version, key) + val, err := db.getWithCollector(storeKey, version, key, collector) if err != nil { return false, err } @@ -317,6 +368,10 @@ func (db *Database) Has(storeKey string, version int64, key []byte) (bool, error } func (db *Database) Get(storeKey string, targetVersion int64, key []byte) (_ []byte, _err error) { + return db.getWithCollector(storeKey, 
targetVersion, key, nil) +} + +func (db *Database) getWithCollector(storeKey string, targetVersion int64, key []byte, collector types.ReadTraceCollector) (_ []byte, _err error) { startTime := time.Now() defer func() { otelMetrics.getLatency.Record( @@ -327,12 +382,28 @@ func (db *Database) Get(storeKey string, targetVersion int64, key []byte) (_ []b attribute.String("store", storeKey), ), ) + recordReadTrace(collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "mvcc", + Operation: "get", + DurationNanos: time.Since(startTime).Nanoseconds(), + Key: slices.Clone(key), + }) }() if targetVersion < db.GetEarliestVersion() { return nil, nil } - prefixedVal, err := getMVCCSlice(db.storage, storeKey, key, targetVersion) + if val, found, err := getLatestPointerValue( + func(k []byte) ([]byte, io.Closer, error) { return db.storage.Get(k) }, + storeKey, key, targetVersion, db.GetEarliestVersion(), db.config.KeepLastVersion, + ); err != nil { + return nil, err + } else if found { + return val, nil + } + + prefixedVal, err := getMVCCSlice(db.storage, storeKey, key, targetVersion, collector) if err != nil { if errors.Is(err, errorutils.ErrRecordNotFound) { return nil, nil @@ -341,30 +412,7 @@ func (db *Database) Get(storeKey string, targetVersion int64, key []byte) (_ []b return nil, fmt.Errorf("failed to perform PebbleDB read: %w", err) } - valBz, tombBz, ok := SplitMVCCKey(prefixedVal) - if !ok { - return nil, fmt.Errorf("invalid PebbleDB MVCC value: %s", prefixedVal) - } - - // A tombstone of zero or a target version that is less than the tombstone - // version means the key is not deleted at the target version. - if len(tombBz) == 0 { - return valBz, nil - } - - tombstone, err := decodeUint64Ascending(tombBz) - if err != nil { - return nil, fmt.Errorf("failed to decode value tombstone: %w", err) - } - - // A tombstone of zero or a target version that is less than the tombstone - // version means the key is not deleted at the target version. 
- if targetVersion < tombstone { - return valBz, nil - } - - // the value is considered deleted - return nil, nil + return visibleValueAtVersion(prefixedVal, targetVersion) } func (db *Database) ApplyChangesetSync(version int64, changeset []*proto.NamedChangeSet) (_err error) { @@ -504,10 +552,10 @@ func (db *Database) Prune(version int64) (_err error) { defer func() { _ = batch.Close() }() var ( - counter int - prevKey, prevKeyEncoded, prevValEncoded []byte - prevVersionDecoded int64 - prevStore string + counter int + prevKey []byte + prevVersionDecoded int64 + prevStore string ) for itr.First(); itr.Valid(); { @@ -543,11 +591,18 @@ func (db *Database) Prune(version int64) (_err error) { } } - currVersionDecoded, err := decodeUint64Ascending(currVersion) + currVersionDecoded, err := decodeUint64Descending(currVersion) if err != nil { return err } + // Skip the sentinel latest-pointer entry; prune only operates on real + // historical versions. The pointer is kept live and updated by writes. 
+ if currVersionDecoded == latestPointerVersion { + itr.Next() + continue + } + // Seek to next key if we are at a version which is higher than prune height // Do not seek to next key if KeepLastVersion is false and we need to delete the previous key in pruning if currVersionDecoded > version && (db.config.KeepLastVersion || prevVersionDecoded > version) { @@ -555,11 +610,12 @@ func (db *Database) Prune(version int64) (_err error) { continue } - // Delete a key if another entry for that key exists at a larger version than original but leq to the prune height - // Also delete a key if it has been tombstoned and its version is leq to the prune height - // Also delete a key if KeepLastVersion is false and version is leq to the prune height - if prevVersionDecoded <= version && (bytes.Equal(prevKey, currKey) || valTombstoned(prevValEncoded) || !db.config.KeepLastVersion) { - err = batch.Delete(prevKeyEncoded, nil) + // With descending MVCC ordering, the first version seen for a logical key is + // the newest one. Any later version for the same key is older and can be + // pruned once it falls below the prune height. If KeepLastVersion is false, + // even the first/only version at or below the prune height can be deleted. 
+ if currVersionDecoded <= version && (bytes.Equal(prevKey, currKey) || !db.config.KeepLastVersion) { + err = batch.Delete(currKeyEncoded, nil) if err != nil { return err } @@ -579,8 +635,6 @@ func (db *Database) Prune(version int64) (_err error) { // Update prevKey and prevVersion for next iteration prevKey = currKey prevVersionDecoded = currVersionDecoded - prevKeyEncoded = currKeyEncoded - prevValEncoded = slices.Clone(itr.Value()) itr.Next() } @@ -597,6 +651,10 @@ func (db *Database) Prune(version int64) (_err error) { } func (db *Database) Iterator(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { + return db.iteratorWithCollector(storeKey, version, start, end, nil) +} + +func (db *Database) iteratorWithCollector(storeKey string, version int64, start, end []byte, collector types.ReadTraceCollector) (types.DBIterator, error) { if (start != nil && len(start) == 0) || (end != nil && len(end) == 0) { return nil, errorutils.ErrKeyEmpty } @@ -610,14 +668,25 @@ func (db *Database) Iterator(storeKey string, version int64, start, end []byte) var upperBound []byte if end != nil { upperBound = MVCCEncode(prependStoreKey(storeKey, end), 0) + } else { + upperBound = iteratorUpperBoundForStore(storeKey) } + iterStart := time.Now() itr, err := db.storage.NewIter(&pebble.IterOptions{LowerBound: lowerBound, UpperBound: upperBound}) if err != nil { return nil, fmt.Errorf("failed to create PebbleDB iterator: %w", err) } + recordReadTrace(collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "pebble", + Operation: "newIter", + DurationNanos: time.Since(iterStart).Nanoseconds(), + Start: slices.Clone(lowerBound), + End: slices.Clone(upperBound), + }) - return newPebbleDBIterator(itr, storePrefix(storeKey), start, end, version, db.GetEarliestVersion(), false, storeKey), nil + return newPebbleDBIterator(itr, storePrefix(storeKey), start, end, version, db.GetEarliestVersion(), false, storeKey, collector), nil } // Taken from pebbledb prefix 
upper bound @@ -635,6 +704,10 @@ func prefixEnd(b []byte) []byte { } func (db *Database) ReverseIterator(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { + return db.reverseIteratorWithCollector(storeKey, version, start, end, nil) +} + +func (db *Database) reverseIteratorWithCollector(storeKey string, version int64, start, end []byte, collector types.ReadTraceCollector) (types.DBIterator, error) { if (start != nil && len(start) == 0) || (end != nil && len(end) == 0) { return nil, errorutils.ErrKeyEmpty } @@ -652,12 +725,22 @@ func (db *Database) ReverseIterator(storeKey string, version int64, start, end [ upperBound = MVCCEncode(prefixEnd(storePrefix(storeKey)), 0) } + iterStart := time.Now() itr, err := db.storage.NewIter(&pebble.IterOptions{LowerBound: lowerBound, UpperBound: upperBound}) if err != nil { return nil, fmt.Errorf("failed to create PebbleDB iterator: %w", err) } + recordReadTrace(collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "pebble", + Operation: "newIter", + DurationNanos: time.Since(iterStart).Nanoseconds(), + Start: slices.Clone(lowerBound), + End: slices.Clone(upperBound), + Reverse: true, + }) - return newPebbleDBIterator(itr, storePrefix(storeKey), start, end, version, db.GetEarliestVersion(), true, storeKey), nil + return newPebbleDBIterator(itr, storePrefix(storeKey), start, end, version, db.GetEarliestVersion(), true, storeKey, collector), nil } // Import loads the initial version of the state in parallel with numWorkers goroutines @@ -685,6 +768,9 @@ func (db *Database) Import(version int64, ch <-chan types.SnapshotNode) (_err er var counter int for entry := range ch { + if entry.StoreKey == "" || len(entry.Key) == 0 { + continue + } err := batch.Set(entry.StoreKey, entry.Key, entry.Value) if err != nil { panic(err) @@ -754,11 +840,17 @@ func (db *Database) RawIterate(storeKey string, fn func(key []byte, value []byte // Parse prefix out of the key parsedKey := currKey[len(prefix):] - 
currVersionDecoded, err := decodeUint64Ascending(currVersion) + currVersionDecoded, err := decodeUint64Descending(currVersion) if err != nil { return false, err } + // Skip the sentinel latest-pointer entry; it is a derived index, not + // real data, and its value has a different encoding. + if currVersionDecoded == latestPointerVersion { + continue + } + // Decode the value currValEncoded := itr.Value() if valTombstoned(currValEncoded) { @@ -857,41 +949,534 @@ func parseStoreKey(key []byte) (string, error) { return keyStr[LenPrefixStore : LenPrefixStore+slashIndex], nil } -func getMVCCSlice(db *pebble.DB, storeKey string, key []byte, version int64) ([]byte, error) { - // end domain is exclusive, so we need to increment the version by 1 - if version < math.MaxInt64 { - version++ - } - +func getMVCCSlice(db *pebble.DB, storeKey string, key []byte, version int64, collector types.ReadTraceCollector) ([]byte, error) { + totalStart := time.Now() + defer func() { + recordReadTrace(collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "mvcc", + Operation: "getMVCCSlice", + DurationNanos: time.Since(totalStart).Nanoseconds(), + Key: slices.Clone(key), + }) + }() + prefixedKey := prependStoreKey(storeKey, key) + seekKey := MVCCEncode(prefixedKey, version) + lowerBound := seekKey + upperBound := iteratorUpperBoundForLogicalKey(prefixedKey) + iterStart := time.Now() itr, err := db.NewIter(&pebble.IterOptions{ - LowerBound: MVCCEncode(prependStoreKey(storeKey, key), 0), - UpperBound: MVCCEncode(prependStoreKey(storeKey, key), version), + LowerBound: lowerBound, + UpperBound: upperBound, }) if err != nil { return nil, fmt.Errorf("failed to create PebbleDB iterator: %w", err) } + recordReadTrace(collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "pebble", + Operation: "newIter", + DurationNanos: time.Since(iterStart).Nanoseconds(), + Start: slices.Clone(lowerBound), + End: slices.Clone(upperBound), + Reverse: true, + }) defer func() { + closeStart := 
time.Now() err = errorutils.Join(err, itr.Close()) + recordReadTrace(collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "pebble", + Operation: "iterClose", + DurationNanos: time.Since(closeStart).Nanoseconds(), + Key: slices.Clone(key), + Reverse: true, + }) }() - if !itr.Last() { + firstStart := time.Now() + firstOK := itr.First() + recordReadTrace(collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "pebble", + Operation: "first", + DurationNanos: time.Since(firstStart).Nanoseconds(), + Key: slices.Clone(key), + }) + if !firstOK { return nil, errorutils.ErrRecordNotFound } - _, vBz, ok := SplitMVCCKey(itr.Key()) + keyReadStart := time.Now() + rawIterKey := slices.Clone(itr.Key()) + recordReadTrace(collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "pebble", + Operation: "iterKey", + DurationNanos: time.Since(keyReadStart).Nanoseconds(), + Key: rawIterKey, + Reverse: true, + }) + + splitKeyStart := time.Now() + userKey, vBz, ok := SplitMVCCKey(rawIterKey) + recordReadTrace(collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "mvcc", + Operation: "splitKey", + DurationNanos: time.Since(splitKeyStart).Nanoseconds(), + Key: rawIterKey, + }) if !ok { - return nil, fmt.Errorf("invalid PebbleDB MVCC key: %s", itr.Key()) + return nil, fmt.Errorf("invalid PebbleDB MVCC key: %s", rawIterKey) + } + if !bytes.Equal(userKey, prefixedKey) { + return nil, errorutils.ErrRecordNotFound } - keyVersion, err := decodeUint64Ascending(vBz) + decodeVersionStart := time.Now() + keyVersion, err := decodeUint64Descending(vBz) + recordReadTrace(collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "mvcc", + Operation: "decodeKeyVersion", + DurationNanos: time.Since(decodeVersionStart).Nanoseconds(), + Key: rawIterKey, + }) if err != nil { return nil, fmt.Errorf("failed to decode key version: %w", err) } if keyVersion > version { - return nil, fmt.Errorf("key version too large: %d", keyVersion) + return nil, 
errorutils.ErrRecordNotFound + } + + valueReadStart := time.Now() + rawIterValue := itr.Value() + recordReadTrace(collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "pebble", + Operation: "iterValue", + DurationNanos: time.Since(valueReadStart).Nanoseconds(), + Key: rawIterKey, + Reverse: true, + }) + + valueCloneStart := time.Now() + clonedValue := slices.Clone(rawIterValue) + recordReadTrace(collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "mvcc", + Operation: "cloneValue", + DurationNanos: time.Since(valueCloneStart).Nanoseconds(), + Key: rawIterKey, + }) + + return clonedValue, nil +} + +func getMVCCSliceWithSession(session *historicalReadSession, storeKey string, key []byte, version int64, collector types.ReadTraceCollector) ([]byte, error) { + totalStart := time.Now() + defer func() { + recordReadTrace(collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "mvcc", + Operation: "getMVCCSlice", + DurationNanos: time.Since(totalStart).Nanoseconds(), + Key: slices.Clone(key), + }) + }() + + prefixedKey := prependStoreKey(storeKey, key) + seekKey := MVCCEncode(prefixedKey, version) + + itr, created, iterDuration, err := session.getOrCreateIterator(storeKey) + if err != nil { + return nil, err + } + if created { + recordReadTrace(collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "pebble", + Operation: "newIter", + DurationNanos: iterDuration.Nanoseconds(), + Start: slices.Clone(MVCCEncode(prependStoreKey(storeKey, nil), 0)), + End: slices.Clone(iteratorUpperBoundForStore(storeKey)), + }) + } + + seekStart := time.Now() + session.mu.Lock() + ok := itr.SeekGE(seekKey) + var ( + rawIterKey []byte + rawIterValue []byte + ) + if ok { + rawIterKey = slices.Clone(itr.Key()) + rawIterValue = slices.Clone(itr.Value()) + } + iterErr := itr.Error() + session.mu.Unlock() + recordReadTrace(collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "pebble", + Operation: "seekGE", + DurationNanos: 
time.Since(seekStart).Nanoseconds(), + Key: slices.Clone(seekKey), + }) + if iterErr != nil { + return nil, iterErr + } + if !ok { + return nil, errorutils.ErrRecordNotFound + } + + keyReadStart := time.Now() + recordReadTrace(collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "pebble", + Operation: "iterKey", + DurationNanos: time.Since(keyReadStart).Nanoseconds(), + Key: rawIterKey, + Reverse: true, + }) + + splitKeyStart := time.Now() + userKey, vBz, ok := SplitMVCCKey(rawIterKey) + recordReadTrace(collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "mvcc", + Operation: "splitKey", + DurationNanos: time.Since(splitKeyStart).Nanoseconds(), + Key: rawIterKey, + }) + if !ok { + return nil, fmt.Errorf("invalid PebbleDB MVCC key: %s", rawIterKey) + } + if !bytes.Equal(userKey, prefixedKey) { + return nil, errorutils.ErrRecordNotFound + } + + decodeVersionStart := time.Now() + keyVersion, err := decodeUint64Descending(vBz) + recordReadTrace(collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "mvcc", + Operation: "decodeKeyVersion", + DurationNanos: time.Since(decodeVersionStart).Nanoseconds(), + Key: rawIterKey, + }) + if err != nil { + return nil, fmt.Errorf("failed to decode key version: %w", err) + } + if keyVersion > version { + return nil, errorutils.ErrRecordNotFound + } + + valueReadStart := time.Now() + recordReadTrace(collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "pebble", + Operation: "iterValue", + DurationNanos: time.Since(valueReadStart).Nanoseconds(), + Key: rawIterKey, + Reverse: true, + }) + + valueCloneStart := time.Now() + clonedValue := slices.Clone(rawIterValue) + recordReadTrace(collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "mvcc", + Operation: "cloneValue", + DurationNanos: time.Since(valueCloneStart).Nanoseconds(), + Key: rawIterKey, + }) + + return clonedValue, nil +} + +func (db *Database) WithReadTraceCollector(collector types.ReadTraceCollector) types.StateStore { + if 
collector == nil { + return db + } + session := newHistoricalReadSession(db.storage) + traced := &tracedDatabase{Database: db, collector: collector, readSession: session} + if registry, ok := collector.(readTraceCloserRegistry); ok { + registry.AddReadTraceCloser(session) } + return traced +} + +func (db *tracedDatabase) Get(storeKey string, version int64, key []byte) ([]byte, error) { + return db.getWithSession(storeKey, version, key) +} + +func (db *tracedDatabase) Has(storeKey string, version int64, key []byte) (bool, error) { + start := time.Now() + defer recordReadTrace(db.collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "mvcc", + Operation: "has", + DurationNanos: time.Since(start).Nanoseconds(), + Key: slices.Clone(key), + }) + val, err := db.getWithSession(storeKey, version, key) + if err != nil { + return false, err + } + return val != nil, nil +} + +func (db *tracedDatabase) Iterator(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { + return db.Database.iteratorWithCollector(storeKey, version, start, end, db.collector) +} + +func (db *tracedDatabase) ReverseIterator(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { + return db.Database.reverseIteratorWithCollector(storeKey, version, start, end, db.collector) +} + +func recordReadTrace(collector types.ReadTraceCollector, event types.ReadTraceEvent) { + if collector == nil { + return + } + collector.RecordReadTrace(event) +} + +func (db *tracedDatabase) getWithSession(storeKey string, targetVersion int64, key []byte) (_ []byte, _err error) { + startTime := time.Now() + defer func() { + otelMetrics.getLatency.Record( + context.Background(), + time.Since(startTime).Seconds(), + metric.WithAttributes( + attribute.Bool("success", _err == nil), + attribute.String("store", storeKey), + ), + ) + recordReadTrace(db.collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "mvcc", + Operation: "get", + DurationNanos: 
time.Since(startTime).Nanoseconds(), + Key: slices.Clone(key), + }) + }() + if targetVersion < db.GetEarliestVersion() { + return nil, nil + } + + if val, found, err := getLatestPointerValue( + func(k []byte) ([]byte, io.Closer, error) { return db.readSession.snapshot.Get(k) }, + storeKey, key, targetVersion, db.GetEarliestVersion(), db.config.KeepLastVersion, + ); err != nil { + return nil, err + } else if found { + db.readSession.store(storeKey, targetVersion, key, val) + return val, nil + } + + if val, found := db.readSession.lookup(storeKey, targetVersion, key); found { + recordReadTrace(db.collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "mvcc", + Operation: "readCacheHit", + DurationNanos: 0, + Key: slices.Clone(key), + }) + return val, nil + } + recordReadTrace(db.collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "mvcc", + Operation: "readCacheMiss", + DurationNanos: 0, + Key: slices.Clone(key), + }) + + prefixedVal, err := getMVCCSliceWithSession(db.readSession, storeKey, key, targetVersion, db.collector) + if err != nil { + if errors.Is(err, errorutils.ErrRecordNotFound) { + db.readSession.store(storeKey, targetVersion, key, nil) + return nil, nil + } + return nil, fmt.Errorf("failed to perform PebbleDB read: %w", err) + } + + val, err := visibleValueAtVersion(prefixedVal, targetVersion) + if err != nil { + return nil, err + } + db.readSession.store(storeKey, targetVersion, key, val) + return val, nil +} + +func visibleValueAtVersion(prefixedVal []byte, targetVersion int64) ([]byte, error) { + valBz, tombBz, ok := SplitMVCCKey(prefixedVal) + if !ok { + return nil, fmt.Errorf("invalid PebbleDB MVCC value: %s", prefixedVal) + } + if len(tombBz) == 0 { + return valBz, nil + } + tombstone, err := decodeUint64Descending(tombBz) + if err != nil { + return nil, fmt.Errorf("failed to decode value tombstone: %w", err) + } + if targetVersion < tombstone { + return valBz, nil + } + return nil, nil +} + +func newHistoricalReadSession(db 
*pebble.DB) *historicalReadSession { + session := &historicalReadSession{ + snapshot: db.NewSnapshot(), + iterators: map[string]*pebble.Iterator{}, + cache: map[historicalReadCacheKey]historicalReadCacheValue{}, + } + return session +} + +func (s *historicalReadSession) lookup(storeKey string, version int64, key []byte) ([]byte, bool) { + s.mu.Lock() + defer s.mu.Unlock() + entry, ok := s.cache[historicalReadCacheKey{storeKey: storeKey, version: version, key: string(key)}] + if !ok { + return nil, false + } + if !entry.found { + return nil, true + } + return slices.Clone(entry.value), true +} - return slices.Clone(itr.Value()), nil +func (s *historicalReadSession) store(storeKey string, version int64, key []byte, value []byte) { + s.mu.Lock() + defer s.mu.Unlock() + cacheValue := historicalReadCacheValue{found: value != nil} + if value != nil { + cacheValue.value = slices.Clone(value) + } + s.cache[historicalReadCacheKey{storeKey: storeKey, version: version, key: string(key)}] = cacheValue +} + +func (s *historicalReadSession) getOrCreateIterator(storeKey string) (*pebble.Iterator, bool, time.Duration, error) { + s.mu.Lock() + defer s.mu.Unlock() + if itr, ok := s.iterators[storeKey]; ok { + return itr, false, 0, nil + } + start := time.Now() + itr, err := s.snapshot.NewIter(&pebble.IterOptions{ + LowerBound: MVCCEncode(prependStoreKey(storeKey, nil), 0), + UpperBound: iteratorUpperBoundForStore(storeKey), + }) + if err != nil { + return nil, false, 0, err + } + s.iterators[storeKey] = itr + return itr, true, time.Since(start), nil +} + +func (s *historicalReadSession) Close() error { + s.mu.Lock() + defer s.mu.Unlock() + if s.closed { + return nil + } + s.closed = true + var lastErr error + for _, itr := range s.iterators { + if err := itr.Close(); err != nil { + lastErr = err + } + } + if s.snapshot != nil { + if err := s.snapshot.Close(); err != nil { + lastErr = err + } + } + s.iterators = nil + s.cache = nil + s.snapshot = nil + return lastErr +} + +func 
iteratorUpperBoundForStore(storeKey string) []byte { + upperStorePrefix := prefixEnd(storePrefix(storeKey)) + if upperStorePrefix == nil { + return nil + } + return MVCCEncode(upperStorePrefix, 0) +} + +func iteratorUpperBoundForLogicalKey(key []byte) []byte { + upperKeyPrefix := prefixEnd(key) + if upperKeyPrefix == nil { + return nil + } + return MVCCEncode(upperKeyPrefix, 0) +} + +// encodeLatestPointerValue stores the write version alongside the actual +// prefixed MVCC value so readers can both validate the pointer is visible at +// their requested version and decode tombstone/value info. +func encodeLatestPointerValue(version int64, prefixedVal []byte) []byte { + var versionBz [VersionSize]byte + binary.LittleEndian.PutUint64(versionBz[:], uint64(version)) + return append(versionBz[:], prefixedVal...) +} + +func decodeLatestPointerValue(bz []byte) (int64, []byte, error) { + if len(bz) < VersionSize { + return 0, nil, fmt.Errorf("latest pointer entry too short: %d", len(bz)) + } + v := binary.LittleEndian.Uint64(bz[:VersionSize]) + if v > math.MaxInt64 { + return 0, nil, fmt.Errorf("latest pointer version overflows int64: %d", v) + } + return int64(v), bz[VersionSize:], nil +} + +// getLatestPointerValue is the fast-path read: a single bloom-filter accelerated +// db.Get. Returns (value, true, nil) only when the pointer is visible at the +// caller's targetVersion. Falls back to the MVCC scan path otherwise. 
+func getLatestPointerValue( + getter func(key []byte) ([]byte, io.Closer, error), + storeKey string, + key []byte, + targetVersion, earliestVersion int64, + keepLastVersion bool, +) ([]byte, bool, error) { + if storeKey == "" || len(key) == 0 { + return nil, false, nil + } + latestKey := MVCCEncode(prependStoreKey(storeKey, key), latestPointerVersion) + val, closer, err := getter(latestKey) + if err != nil { + if errors.Is(err, pebble.ErrNotFound) { + return nil, false, nil + } + return nil, false, fmt.Errorf("failed latest-pointer lookup: %w", err) + } + defer func() { _ = closer.Close() }() + + latestVersion, prefixedVal, err := decodeLatestPointerValue(utils.Clone(val)) + if err != nil { + return nil, false, err + } + if latestVersion < earliestVersion && !keepLastVersion { + return nil, false, nil + } + if latestVersion > targetVersion { + return nil, false, nil + } + value, err := visibleValueAtVersion(prefixedVal, targetVersion) + if err != nil { + return nil, false, err + } + return value, true, nil } func valTombstoned(value []byte) bool { diff --git a/sei-db/db_engine/pebbledb/mvcc/iterator.go b/sei-db/db_engine/pebbledb/mvcc/iterator.go index a8f2ebf43d..6715e80a86 100644 --- a/sei-db/db_engine/pebbledb/mvcc/iterator.go +++ b/sei-db/db_engine/pebbledb/mvcc/iterator.go @@ -4,7 +4,9 @@ import ( "bytes" "context" "fmt" + "math" "sync" + "time" "github.com/cockroachdb/pebble/v2" "go.opentelemetry.io/otel/attribute" @@ -33,27 +35,30 @@ type iterator struct { reverse bool iterationCount int64 storeKey string + collector types.ReadTraceCollector closeSync sync.Once } -func newPebbleDBIterator(src *pebble.Iterator, prefix, mvccStart, mvccEnd []byte, version int64, earliestVersion int64, reverse bool, storeKey string) *iterator { +func newPebbleDBIterator(src *pebble.Iterator, prefix, mvccStart, mvccEnd []byte, version int64, earliestVersion int64, reverse bool, storeKey string, collector types.ReadTraceCollector) *iterator { // Return invalid iterator if 
requested iterator height is lower than earliest version after pruning if version < earliestVersion { return &iterator{ - source: src, - prefix: prefix, - start: mvccStart, - end: mvccEnd, - version: version, - valid: false, - reverse: reverse, - storeKey: storeKey, + source: src, + prefix: prefix, + start: mvccStart, + end: mvccEnd, + version: version, + valid: false, + reverse: reverse, + storeKey: storeKey, + collector: collector, } } // move the underlying PebbleDB iterator to the first key var valid bool + positionStart := time.Now() if reverse { valid = src.Last() } else { @@ -61,38 +66,32 @@ func newPebbleDBIterator(src *pebble.Iterator, prefix, mvccStart, mvccEnd []byte } itr := &iterator{ - source: src, - prefix: prefix, - start: mvccStart, - end: mvccEnd, - version: version, - valid: valid, - reverse: reverse, - storeKey: storeKey, + source: src, + prefix: prefix, + start: mvccStart, + end: mvccEnd, + version: version, + valid: valid, + reverse: reverse, + storeKey: storeKey, + collector: collector, + } + if reverse { + itr.recordPebbleOp("last", time.Since(positionStart), nil) + } else { + itr.recordPebbleOp("first", time.Since(positionStart), nil) } if valid { - currKey, currKeyVersion, ok := SplitMVCCKey(itr.source.Key()) + currKey, _, ok := SplitMVCCKey(itr.source.Key()) if !ok { // XXX: This should not happen as that would indicate we have a malformed MVCC key. 
panic(fmt.Sprintf("invalid PebbleDB MVCC key: %s", itr.source.Key())) } - - curKeyVersionDecoded, err := decodeUint64Ascending(currKeyVersion) - if err != nil { - itr.valid = false - return itr - } - - // We need to check whether initial key iterator visits has a version <= requested version - // If larger version, call next to find another key which does - if curKeyVersionDecoded > itr.version { - itr.Next() + if reverse { + itr.positionAtOrBeforeKey(currKey) } else { - // If version is less, seek to the largest version of that key <= requested iterator version - // It is guaranteed this won't move the iterator to a key that is invalid since - // curKeyVersionDecoded <= requested iterator version, so there exists at least one version of currKey SeekLT may move to - itr.valid = itr.source.SeekLT(MVCCEncode(currKey, itr.version+1)) + itr.positionAtOrAfterKey(currKey) } } @@ -109,6 +108,99 @@ func newPebbleDBIterator(src *pebble.Iterator, prefix, mvccStart, mvccEnd []byte return itr } +func (itr *iterator) seekVisibleVersionForKey(targetKey []byte) bool { + seekKey := MVCCEncode(targetKey, itr.version) + seekStart := time.Now() + valid := itr.source.SeekGE(seekKey) + itr.recordPebbleOp("seekGE", time.Since(seekStart), seekKey) + if !valid { + return false + } + + foundKey, foundVersion, ok := SplitMVCCKey(itr.source.Key()) + if !ok { + return false + } + if !bytes.Equal(foundKey, targetKey) { + return false + } + foundVersionDecoded, err := decodeUint64Descending(foundVersion) + if err != nil { + return false + } + // Never surface the sentinel latest-pointer entry through iteration. 
+ if foundVersionDecoded == latestPointerVersion { + return false + } + return foundVersionDecoded <= itr.version +} + +func (itr *iterator) nextLogicalKey(currKey []byte) ([]byte, bool) { + nextKeyPrefix := prefixEnd(currKey) + if nextKeyPrefix == nil { + return nil, false + } + seekKey := MVCCEncode(nextKeyPrefix, math.MaxInt64) + seekStart := time.Now() + valid := itr.source.SeekGE(seekKey) + itr.recordPebbleOp("seekGE", time.Since(seekStart), seekKey) + if !valid { + return nil, false + } + nextKey, _, ok := SplitMVCCKey(itr.source.Key()) + if !ok || !bytes.HasPrefix(nextKey, itr.prefix) { + return nil, false + } + return nextKey, true +} + +func (itr *iterator) prevLogicalKey(currKey []byte) ([]byte, bool) { + seekKey := MVCCEncode(currKey, math.MaxInt64) + seekStart := time.Now() + valid := itr.source.SeekLT(seekKey) + itr.recordPebbleOp("seekLT", time.Since(seekStart), seekKey) + if !valid { + return nil, false + } + prevKey, _, ok := SplitMVCCKey(itr.source.Key()) + if !ok || !bytes.HasPrefix(prevKey, itr.prefix) { + return nil, false + } + return prevKey, true +} + +func (itr *iterator) positionAtOrAfterKey(startKey []byte) { + currentKey := startKey + for { + itr.valid = itr.seekVisibleVersionForKey(currentKey) + if itr.valid && !itr.cursorTombstoned() { + return + } + nextKey, ok := itr.nextLogicalKey(currentKey) + if !ok { + itr.valid = false + return + } + currentKey = nextKey + } +} + +func (itr *iterator) positionAtOrBeforeKey(startKey []byte) { + currentKey := startKey + for { + itr.valid = itr.seekVisibleVersionForKey(currentKey) + if itr.valid && !itr.cursorTombstoned() { + return + } + prevKey, ok := itr.prevLogicalKey(currentKey) + if !ok { + itr.valid = false + return + } + currentKey = prevKey + } +} + // Domain returns the domain of the iterator. The caller must not modify the // return values. func (itr *iterator) Domain() ([]byte, []byte) { @@ -154,81 +246,12 @@ func (itr *iterator) nextForward() { // MVCC key. 
panic(fmt.Sprintf("invalid PebbleDB MVCC key: %s", itr.source.Key())) } - - next := itr.source.NextPrefix() - - // First move the iterator to the next prefix, which may not correspond to the - // desired version for that key, e.g. if the key was written at a later version, - // so we seek back to the latest desired version, s.t. the version is <= itr.version. - if next { - nextKey, _, ok := SplitMVCCKey(itr.source.Key()) - if !ok { - // XXX: This should not happen as that would indicate we have a malformed - // MVCC key. - itr.valid = false - return - } - if !bytes.HasPrefix(nextKey, itr.prefix) { - // the next key must have itr.prefix as the prefix - itr.valid = false - return - } - - // Move the iterator to the closest version to the desired version, so we - // append the current iterator key to the prefix and seek to that key. - itr.valid = itr.source.SeekLT(MVCCEncode(nextKey, itr.version+1)) - - tmpKey, tmpKeyVersion, ok := SplitMVCCKey(itr.source.Key()) - if !ok { - // XXX: This should not happen as that would indicate we have a malformed - // MVCC key. - itr.valid = false - return - } - - // There exists cases where the SeekLT() call moved us back to the same key - // we started at, so we must move to next key, i.e. two keys forward. - if bytes.Equal(tmpKey, currKey) { - if itr.source.NextPrefix() { - itr.nextForward() - - _, tmpKeyVersion, ok = SplitMVCCKey(itr.source.Key()) - if !ok { - // XXX: This should not happen as that would indicate we have a malformed - // MVCC key. 
- itr.valid = false - return - } - - } else { - itr.valid = false - return - } - } - - // We need to verify that every Next call either moves the iterator to a key whose version - // is less than or equal to requested iterator version, or exhausts the iterator - tmpKeyVersionDecoded, err := decodeUint64Ascending(tmpKeyVersion) - if err != nil { - itr.valid = false - return - } - - // If iterator is at a entry whose version is higher than requested version, call nextForward again - if tmpKeyVersionDecoded > itr.version { - itr.nextForward() - } - - // The cursor might now be pointing at a key/value pair that is tombstoned. - // If so, we must move the cursor. - if itr.valid && itr.cursorTombstoned() { - itr.nextForward() - } - + nextKey, ok := itr.nextLogicalKey(currKey) + if !ok { + itr.valid = false return } - - itr.valid = false + itr.positionAtOrAfterKey(nextKey) } func (itr *iterator) nextReverse() { @@ -244,60 +267,12 @@ func (itr *iterator) nextReverse() { panic(fmt.Sprintf("invalid PebbleDB MVCC key: %s", itr.source.Key())) } - next := itr.source.SeekLT(MVCCEncode(currKey, 0)) - - // First move the iterator to the next prefix, which may not correspond to the - // desired version for that key, e.g. if the key was written at a later version, - // so we seek back to the latest desired version, s.t. the version is <= itr.version. - if next { - nextKey, _, ok := SplitMVCCKey(itr.source.Key()) - if !ok { - // XXX: This should not happen as that would indicate we have a malformed - // MVCC key. - itr.valid = false - return - } - if !bytes.HasPrefix(nextKey, itr.prefix) { - // the next key must have itr.prefix as the prefix - itr.valid = false - return - } - - // Move the iterator to the closest version to the desired version, so we - // append the current iterator key to the prefix and seek to that key. 
- itr.valid = itr.source.SeekLT(MVCCEncode(nextKey, itr.version+1)) - - _, tmpKeyVersion, ok := SplitMVCCKey(itr.source.Key()) - if !ok { - // XXX: This should not happen as that would indicate we have a malformed - // MVCC key. - itr.valid = false - return - } - - // We need to verify that every Next call either moves the iterator to a key whose version - // is less than or equal to requested iterator version, or exhausts the iterator - tmpKeyVersionDecoded, err := decodeUint64Ascending(tmpKeyVersion) - if err != nil { - itr.valid = false - return - } - - // If iterator is at a entry whose version is higher than requested version, call nextReverse again - if tmpKeyVersionDecoded > itr.version { - itr.nextReverse() - } - - // The cursor might now be pointing at a key/value pair that is tombstoned. - // If so, we must move the cursor. - if itr.valid && itr.cursorTombstoned() { - itr.nextReverse() - } - + prevKey, ok := itr.prevLogicalKey(currKey) + if !ok { + itr.valid = false return } - - itr.valid = false + itr.positionAtOrBeforeKey(prevKey) } func (itr *iterator) Next() { @@ -384,7 +359,7 @@ func (itr *iterator) cursorTombstoned() bool { // If the tombstone suffix is non-empty and greater than the target version, // the value is not tombstoned. - tombstone, err := decodeUint64Ascending(tombBz) + tombstone, err := decodeUint64Descending(tombBz) if err != nil { panic(fmt.Errorf("failed to decode value tombstone: %w", err)) } @@ -401,7 +376,8 @@ func (itr *iterator) DebugRawIterate() { // The first key may not represent the desired target version, so move the // cursor to the correct location. 
firstKey, _, _ := SplitMVCCKey(itr.source.Key()) - valid = itr.source.SeekLT(MVCCEncode(firstKey, itr.version+1)) + itr.positionAtOrAfterKey(firstKey) + valid = itr.valid } for valid { @@ -410,7 +386,7 @@ func (itr *iterator) DebugRawIterate() { panic(fmt.Sprintf("invalid PebbleDB MVCC key: %s", itr.source.Key())) } - version, err := decodeUint64Ascending(vBz) + version, err := decodeUint64Descending(vBz) if err != nil { panic(fmt.Errorf("failed to decode key version: %w", err)) } @@ -422,7 +398,7 @@ func (itr *iterator) DebugRawIterate() { var tombstone int64 if len(tombBz) > 0 { - tombstone, err = decodeUint64Ascending(vBz) + tombstone, err = decodeUint64Descending(vBz) if err != nil { panic(fmt.Errorf("failed to decode value tombstone: %w", err)) } @@ -430,27 +406,35 @@ func (itr *iterator) DebugRawIterate() { fmt.Printf("KEY: %s, VALUE: %s, VERSION: %d, TOMBSTONE: %d\n", key, val, version, tombstone) - var next bool if itr.reverse { - next = itr.source.SeekLT(MVCCEncode(key, 0)) - } else { - next = itr.source.NextPrefix() - } - - if next { - nextKey, _, ok := SplitMVCCKey(itr.source.Key()) + prevKey, ok := itr.prevLogicalKey(key) if !ok { - panic(fmt.Sprintf("invalid PebbleDB MVCC key: %s", itr.source.Key())) - } - - // the next key must have itr.prefix as the prefix - if !bytes.HasPrefix(nextKey, itr.prefix) { valid = false - } else { - valid = itr.source.SeekLT(MVCCEncode(nextKey, itr.version+1)) + continue } + itr.positionAtOrBeforeKey(prevKey) + valid = itr.valid + continue } else { - valid = false + nextKey, ok := itr.nextLogicalKey(key) + if !ok { + valid = false + continue + } + itr.positionAtOrAfterKey(nextKey) + valid = itr.valid + continue } } } + +func (itr *iterator) recordPebbleOp(operation string, duration time.Duration, key []byte) { + recordReadTrace(itr.collector, types.ReadTraceEvent{ + StoreKey: itr.storeKey, + Layer: "pebble", + Operation: operation, + DurationNanos: duration.Nanoseconds(), + Key: slices.Clone(key), + Reverse: itr.reverse, + 
}) +} diff --git a/sei-db/db_engine/types/types.go b/sei-db/db_engine/types/types.go index 161017381d..a56366900b 100644 --- a/sei-db/db_engine/types/types.go +++ b/sei-db/db_engine/types/types.go @@ -2,6 +2,7 @@ package types import ( "io" + "time" "github.com/sei-protocol/sei-chain/sei-db/proto" ) @@ -154,6 +155,34 @@ type StateStore interface { io.Closer } +type ReadTraceEvent struct { + StoreKey string + Layer string + Operation string + DurationNanos int64 + Key []byte + Start []byte + End []byte + Reverse bool +} + +type ReadTraceCollector interface { + RecordReadTrace(ReadTraceEvent) +} + +type TraceableStateStore interface { + WithReadTraceCollector(ReadTraceCollector) StateStore +} + +func NewReadTraceEvent(storeKey, layer, operation string, duration time.Duration) ReadTraceEvent { + return ReadTraceEvent{ + StoreKey: storeKey, + Layer: layer, + Operation: operation, + DurationNanos: duration.Nanoseconds(), + } +} + // DBIterator iterates over versioned key-value pairs. type DBIterator interface { Domain() (start []byte, end []byte) From 4a2e3edd9c814122f2ed733b94a4b151e68e5794 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Thu, 16 Apr 2026 20:24:33 -0400 Subject: [PATCH 02/15] fix: snapshot replay cache checkpoints before reuse ReplayTransactionTillIndex was storing sdk.Context values whose MultiStore field still pointed at the live replay state. The -1 checkpoint was cached before replay started but then advanced in place as transactions were applied, and later cache hits reused the stored checkpoint directly. Store a branched CacheMultiStore snapshot when caching a checkpoint, and branch again when resuming from one, so each replay starts from an immutable saved state instead of mutating the cached checkpoint itself. 
Made-with: Cursor --- evmrpc/simulate.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/evmrpc/simulate.go b/evmrpc/simulate.go index 7621f69fb8..e593ee7a07 100644 --- a/evmrpc/simulate.go +++ b/evmrpc/simulate.go @@ -496,10 +496,12 @@ func (b *Backend) ReplayTransactionTillIndex(ctx context.Context, block *ethtype startIdx := 0 if cachedCtx, cachedIdx, ok := b.getReplayState(block.Hash().Hex(), txIndex); ok { - sdkCtx = sdkCtx.WithMultiStore(cachedCtx.MultiStore()) + // Always replay from a fresh branch of the cached checkpoint so the + // stored snapshot remains immutable across requests. + sdkCtx = sdkCtx.WithMultiStore(cachedCtx.MultiStore().CacheMultiStore()) startIdx = cachedIdx + 1 } else { - b.putReplayState(block.Hash().Hex(), -1, sdkCtx.WithTraceMode(true)) + b.putReplayState(block.Hash().Hex(), -1, snapshotReplayState(sdkCtx.WithTraceMode(true))) } for idx, tx := range tmBlock.Block.Txs { @@ -519,7 +521,7 @@ func (b *Backend) ReplayTransactionTillIndex(ctx context.Context, block *ethtype _ = b.app.DeliverTx(sdkCtx, abci.RequestDeliverTxV2{Tx: tx}, sdkTx, sha256.Sum256(tx)) } finalCtx := sdkCtx.WithIsEVM(true) - b.putReplayState(block.Hash().Hex(), txIndex, finalCtx.WithTraceMode(true)) + b.putReplayState(block.Hash().Hex(), txIndex, snapshotReplayState(finalCtx.WithTraceMode(true))) return state.NewDBImpl(finalCtx, b.keeper, true), tmBlock.Block.Txs, nil } @@ -553,6 +555,10 @@ func (b *Backend) putReplayState(blockHash string, txIndex int, ctx sdk.Context) b.replayStateCache[blockHash][txIndex] = ctx } +func snapshotReplayState(ctx sdk.Context) sdk.Context { + return ctx.WithMultiStore(ctx.MultiStore().CacheMultiStore()) +} + func (b *Backend) StateAtBlock(ctx context.Context, block *ethtypes.Block, reexec uint64, base vm.StateDB, readOnly bool, preferDisk bool) (vm.StateDB, tracers.StateReleaseFunc, error) { emptyRelease := func() {} sdkCtx, _, err := b.initializeBlock(ctx, block) From 
824f13067c3c973cdce3c1edd4ef4f3bf118889a Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Thu, 16 Apr 2026 23:59:32 -0400 Subject: [PATCH 03/15] fix: prune leak under descending MVCC and bound replay state cache sei-db prune: the existing NextPrefix guard was written for ascending version order. Under the new descending encoding the newest version of a key is seen first, so whenever the newest version was above the prune height the iterator skipped to the next logical key and every older version (including ones below the prune height) was leaked on disk. Rewrite the loop around per-logical-key state: on entering a new key with newest > prune, seek straight to the first version <= prune, then keep the newest version at or below the prune height (when KeepLastVersion=true) and delete the rest. The fast-path seek preserves the old optimization of skipping above-prune versions. evmrpc replay cache: replayStateCache grew one entry per (block, tx) forever with no eviction, each holding a branched CacheMultiStore. Wrap it in an expirable LRU bounded by block count (32) and TTL (10m); keep per-block mutexes so inner-map updates stay safe under concurrent traces. Regression tests: - sei-db/.../prune_test.go exercises the descending bug directly by reading raw pebble entries after Prune, covering keepLast true/false, mixed above/below-prune keys, and sentinel preservation. They fail on the pre-fix code. - evmrpc/simulate_cache_test.go covers get/put, LRU eviction past the cap, and concurrent access under -race.
--- evmrpc/simulate.go | 47 +++++-- evmrpc/simulate_cache_test.go | 95 +++++++++++++ sei-db/db_engine/pebbledb/mvcc/db.go | 64 +++++---- sei-db/db_engine/pebbledb/mvcc/prune_test.go | 137 +++++++++++++++++++ 4 files changed, 301 insertions(+), 42 deletions(-) create mode 100644 evmrpc/simulate_cache_test.go create mode 100644 sei-db/db_engine/pebbledb/mvcc/prune_test.go diff --git a/evmrpc/simulate.go b/evmrpc/simulate.go index e593ee7a07..bcc0c8bc3d 100644 --- a/evmrpc/simulate.go +++ b/evmrpc/simulate.go @@ -28,6 +28,7 @@ import ( "github.com/ethereum/go-ethereum/export" "github.com/ethereum/go-ethereum/params" "github.com/ethereum/go-ethereum/rpc" + "github.com/hashicorp/golang-lru/v2/expirable" "github.com/sei-protocol/sei-chain/app/legacyabci" "github.com/sei-protocol/sei-chain/precompiles/wasmd" "github.com/sei-protocol/sei-chain/sei-cosmos/baseapp" @@ -230,10 +231,23 @@ type Backend struct { globalBlockCache BlockCache cacheCreationMutex *sync.Mutex watermarks *WatermarkManager - replayStateCacheMu sync.RWMutex - replayStateCache map[string]map[int]sdk.Context + replayStateCacheMu sync.Mutex + replayStateCache *expirable.LRU[string, *blockReplayState] } +// blockReplayState holds cached replay checkpoints for a single block, keyed +// by tx index. Protected by its own mutex so entries for different blocks +// can be updated independently. 
+type blockReplayState struct { + mu sync.Mutex + checkpoints map[int]sdk.Context +} + +const ( + replayStateCacheBlocks = 32 + replayStateCacheTTL = 10 * time.Minute +) + func NewBackend( ctxProvider func(int64) sdk.Context, keeper *keeper.Keeper, @@ -259,7 +273,9 @@ func NewBackend( globalBlockCache: globalBlockCache, cacheCreationMutex: cacheCreationMutex, watermarks: watermarks, - replayStateCache: map[string]map[int]sdk.Context{}, + replayStateCache: expirable.NewLRU[string, *blockReplayState]( + replayStateCacheBlocks, nil, replayStateCacheTTL, + ), } } @@ -526,15 +542,15 @@ func (b *Backend) ReplayTransactionTillIndex(ctx context.Context, block *ethtype } func (b *Backend) getReplayState(blockHash string, txIndex int) (sdk.Context, int, bool) { - b.replayStateCacheMu.RLock() - defer b.replayStateCacheMu.RUnlock() - blockStates, ok := b.replayStateCache[blockHash] + state, ok := b.replayStateCache.Get(blockHash) if !ok { return sdk.Context{}, 0, false } + state.mu.Lock() + defer state.mu.Unlock() bestIdx := math.MinInt var bestCtx sdk.Context - for idx, ctx := range blockStates { + for idx, ctx := range state.checkpoints { if idx <= txIndex && idx > bestIdx { bestIdx = idx bestCtx = ctx @@ -547,12 +563,19 @@ func (b *Backend) getReplayState(blockHash string, txIndex int) (sdk.Context, in } func (b *Backend) putReplayState(blockHash string, txIndex int, ctx sdk.Context) { - b.replayStateCacheMu.Lock() - defer b.replayStateCacheMu.Unlock() - if _, ok := b.replayStateCache[blockHash]; !ok { - b.replayStateCache[blockHash] = map[int]sdk.Context{} + state, ok := b.replayStateCache.Get(blockHash) + if !ok { + b.replayStateCacheMu.Lock() + state, ok = b.replayStateCache.Get(blockHash) + if !ok { + state = &blockReplayState{checkpoints: map[int]sdk.Context{}} + b.replayStateCache.Add(blockHash, state) + } + b.replayStateCacheMu.Unlock() } - b.replayStateCache[blockHash][txIndex] = ctx + state.mu.Lock() + state.checkpoints[txIndex] = ctx + state.mu.Unlock() } func 
snapshotReplayState(ctx sdk.Context) sdk.Context { diff --git a/evmrpc/simulate_cache_test.go b/evmrpc/simulate_cache_test.go new file mode 100644 index 0000000000..48ddcb1ac9 --- /dev/null +++ b/evmrpc/simulate_cache_test.go @@ -0,0 +1,95 @@ +package evmrpc + +import ( + "fmt" + "sync" + "testing" + "time" + + "github.com/hashicorp/golang-lru/v2/expirable" + "github.com/stretchr/testify/require" + + sdk "github.com/sei-protocol/sei-chain/sei-cosmos/types" +) + +func newTestBackend(size int, ttl time.Duration) *Backend { + return &Backend{ + replayStateCache: expirable.NewLRU[string, *blockReplayState](size, nil, ttl), + } +} + +// TestReplayStateCache_GetPut exercises the basic round-trip and the +// "best checkpoint <= txIndex" selection logic used to resume a trace. +func TestReplayStateCache_GetPut(t *testing.T) { + b := newTestBackend(replayStateCacheBlocks, replayStateCacheTTL) + hash := "0xabc" + + ctx0 := sdk.Context{}.WithBlockHeight(100) + ctx5 := sdk.Context{}.WithBlockHeight(105) + ctx10 := sdk.Context{}.WithBlockHeight(110) + + b.putReplayState(hash, -1, ctx0) + b.putReplayState(hash, 5, ctx5) + b.putReplayState(hash, 10, ctx10) + + // Asking for txIndex=7 should return the checkpoint at idx=5. + got, idx, ok := b.getReplayState(hash, 7) + require.True(t, ok) + require.Equal(t, 5, idx) + require.Equal(t, int64(105), got.BlockHeight()) + + // Asking for txIndex=0 should return the -1 checkpoint. + got, idx, ok = b.getReplayState(hash, 0) + require.True(t, ok) + require.Equal(t, -1, idx) + require.Equal(t, int64(100), got.BlockHeight()) + + // Unknown block returns false. + _, _, ok = b.getReplayState("0xmissing", 0) + require.False(t, ok) +} + +// TestReplayStateCache_EvictsOldBlocks is the regression test for the +// unbounded-memory-growth bug: distinct blocks beyond the cache cap must +// be evicted, not retained forever. 
+func TestReplayStateCache_EvictsOldBlocks(t *testing.T) { + const cap = 4 + b := newTestBackend(cap, time.Hour) + + // Insert more distinct blocks than the cap. + for i := 0; i < cap*3; i++ { + hash := fmt.Sprintf("0x%d", i) + b.putReplayState(hash, 0, sdk.Context{}.WithBlockHeight(int64(i))) + } + + require.LessOrEqual(t, b.replayStateCache.Len(), cap, + "cache must not grow beyond its configured cap") + + // The earliest blocks are gone. + _, _, ok := b.getReplayState("0x0", 0) + require.False(t, ok, "oldest block must have been evicted") + + // The most recent blocks are still present. + _, _, ok = b.getReplayState(fmt.Sprintf("0x%d", cap*3-1), 0) + require.True(t, ok, "most recently added block must still be cached") +} + +// TestReplayStateCache_Concurrent runs parallel puts/gets to catch +// data races on the per-block inner map. Run with `go test -race`. +func TestReplayStateCache_Concurrent(t *testing.T) { + b := newTestBackend(replayStateCacheBlocks, replayStateCacheTTL) + + var wg sync.WaitGroup + for w := 0; w < 8; w++ { + wg.Add(1) + go func(w int) { + defer wg.Done() + hash := fmt.Sprintf("block-%d", w%4) // 4 distinct blocks, contention on each + for i := 0; i < 100; i++ { + b.putReplayState(hash, i, sdk.Context{}.WithBlockHeight(int64(i))) + _, _, _ = b.getReplayState(hash, i) + } + }(w) + } + wg.Wait() +} diff --git a/sei-db/db_engine/pebbledb/mvcc/db.go b/sei-db/db_engine/pebbledb/mvcc/db.go index a3c8dd4e37..70d7ea00ba 100644 --- a/sei-db/db_engine/pebbledb/mvcc/db.go +++ b/sei-db/db_engine/pebbledb/mvcc/db.go @@ -552,10 +552,10 @@ func (db *Database) Prune(version int64) (_err error) { defer func() { _ = batch.Close() }() var ( - counter int - prevKey []byte - prevVersionDecoded int64 - prevStore string + counter int + prevKey []byte + keptBelowPrune bool + prevStore string ) for itr.First(); itr.Valid(); { @@ -603,39 +603,43 @@ func (db *Database) Prune(version int64) (_err error) { continue } - // Seek to next key if we are at a version which 
is higher than prune height - // Do not seek to next key if KeepLastVersion is false and we need to delete the previous key in pruning - if currVersionDecoded > version && (db.config.KeepLastVersion || prevVersionDecoded > version) { - itr.NextPrefix() - continue - } - - // With descending MVCC ordering, the first version seen for a logical key is - // the newest one. Any later version for the same key is older and can be - // pruned once it falls below the prune height. If KeepLastVersion is false, - // even the first/only version at or below the prune height can be deleted. - if currVersionDecoded <= version && (bytes.Equal(prevKey, currKey) || !db.config.KeepLastVersion) { - err = batch.Delete(currKeyEncoded, nil) - if err != nil { - return err + // Reset per-logical-key state when the logical key changes. + if !bytes.Equal(prevKey, currKey) { + prevKey = slices.Clone(currKey) + keptBelowPrune = false + + // Fast path: under descending encoding, versions of a key are stored + // newest-first. When the newest real version is above the prune + // height, seek directly to the first version <= prune height for + // this key instead of iterating through every above-prune version. + if currVersionDecoded > version { + itr.SeekGE(MVCCEncode(currKey, version)) + continue } + } - counter++ - if counter >= PruneCommitBatchSize { - err = batch.Commit(defaultWriteOpts) - if err != nil { + // Descending iteration: for a given logical key we see newest→oldest. + // Versions > prune height are always kept. For versions <= prune + // height, keep only the newest one when KeepLastVersion is true; + // delete every other such version. 
+ if currVersionDecoded <= version { + if db.config.KeepLastVersion && !keptBelowPrune { + keptBelowPrune = true + } else { + if err := batch.Delete(currKeyEncoded, nil); err != nil { return err } - - counter = 0 - batch.Reset() + counter++ + if counter >= PruneCommitBatchSize { + if err := batch.Commit(defaultWriteOpts); err != nil { + return err + } + counter = 0 + batch.Reset() + } } } - // Update prevKey and prevVersion for next iteration - prevKey = currKey - prevVersionDecoded = currVersionDecoded - itr.Next() } diff --git a/sei-db/db_engine/pebbledb/mvcc/prune_test.go b/sei-db/db_engine/pebbledb/mvcc/prune_test.go new file mode 100644 index 0000000000..00ef3c77f1 --- /dev/null +++ b/sei-db/db_engine/pebbledb/mvcc/prune_test.go @@ -0,0 +1,137 @@ +package mvcc + +import ( + "testing" + + "github.com/cockroachdb/pebble/v2" + "github.com/stretchr/testify/require" + + "github.com/sei-protocol/sei-chain/sei-db/config" + "github.com/sei-protocol/sei-chain/sei-db/proto" +) + +// rawVersionsForKey returns every on-disk MVCC version for (store, key), +// excluding the sentinel latest-pointer entry. Used to assert pruning +// actually deletes data rather than just bumping earliestVersion. 
+func rawVersionsForKey(t *testing.T, db *Database, store string, key []byte) []int64 { + t.Helper() + prefix := prependStoreKey(store, key) + lower := MVCCEncode(prefix, 0) + upper := MVCCEncode(append(append([]byte{}, prefix...), 0x01), 0) + itr, err := db.storage.NewIter(&pebble.IterOptions{LowerBound: lower, UpperBound: upper}) + require.NoError(t, err) + defer func() { _ = itr.Close() }() + + var versions []int64 + for itr.First(); itr.Valid(); itr.Next() { + _, vBz, ok := SplitMVCCKey(itr.Key()) + require.True(t, ok) + v, err := decodeUint64Descending(vBz) + require.NoError(t, err) + if v == latestPointerVersion { + continue + } + versions = append(versions, v) + } + return versions +} + +func applyVersion(t *testing.T, db *Database, store string, v int64, key, val []byte) { + t.Helper() + require.NoError(t, db.ApplyChangesetSync(v, []*proto.NamedChangeSet{{ + Name: store, + Changeset: proto.ChangeSet{Pairs: []*proto.KVPair{{Key: key, Value: val}}}, + }})) +} + +func newTestDB(t *testing.T, keepLast bool) *Database { + t.Helper() + cfg := config.DefaultStateStoreConfig() + cfg.Backend = "pebbledb" + cfg.KeepLastVersion = keepLast + store, err := OpenDB(t.TempDir(), cfg) + require.NoError(t, err) + db := store.(*Database) + t.Cleanup(func() { _ = db.Close() }) + return db +} + +// TestPruneDescendingOrder_DeletesOldVersions is a regression test for the +// descending-encoding prune bug: when the newest version of a key is above +// the prune height, older versions that fall below the prune height must +// still be physically deleted. The previous logic called NextPrefix() on +// hitting the newest version and leaked every older version. 
+func TestPruneDescendingOrder_DeletesOldVersions(t *testing.T) { + const store = "store1" + key := []byte("k") + + t.Run("KeepLastVersion=true leaves newest + newest-below-prune", func(t *testing.T) { + db := newTestDB(t, true) + + applyVersion(t, db, store, 50, key, []byte("v50")) + applyVersion(t, db, store, 100, key, []byte("v100")) + applyVersion(t, db, store, 200, key, []byte("v200")) + + require.NoError(t, db.Prune(150)) + + versions := rawVersionsForKey(t, db, store, key) + require.ElementsMatch(t, []int64{100, 200}, versions, + "v50 must be physically deleted; v100 kept as newest below prune; v200 kept as above prune") + }) + + t.Run("KeepLastVersion=false deletes every version <= prune", func(t *testing.T) { + db := newTestDB(t, false) + + applyVersion(t, db, store, 50, key, []byte("v50")) + applyVersion(t, db, store, 100, key, []byte("v100")) + applyVersion(t, db, store, 200, key, []byte("v200")) + + require.NoError(t, db.Prune(150)) + + versions := rawVersionsForKey(t, db, store, key) + require.ElementsMatch(t, []int64{200}, versions, + "everything at or below prune height must be deleted when KeepLastVersion=false") + }) + + t.Run("all versions above prune are retained", func(t *testing.T) { + db := newTestDB(t, true) + + applyVersion(t, db, store, 200, key, []byte("v200")) + applyVersion(t, db, store, 300, key, []byte("v300")) + + require.NoError(t, db.Prune(150)) + + versions := rawVersionsForKey(t, db, store, key) + require.ElementsMatch(t, []int64{200, 300}, versions) + }) + + t.Run("multiple keys pruned independently", func(t *testing.T) { + db := newTestDB(t, true) + + k1, k2 := []byte("k1"), []byte("k2") + applyVersion(t, db, store, 50, k1, []byte("a")) + applyVersion(t, db, store, 100, k1, []byte("b")) + applyVersion(t, db, store, 200, k1, []byte("c")) + + applyVersion(t, db, store, 60, k2, []byte("x")) + applyVersion(t, db, store, 140, k2, []byte("y")) + + require.NoError(t, db.Prune(150)) + + require.ElementsMatch(t, []int64{100, 200}, 
rawVersionsForKey(t, db, store, k1)) + require.ElementsMatch(t, []int64{140}, rawVersionsForKey(t, db, store, k2)) + }) + + t.Run("latest-pointer sentinel is never pruned", func(t *testing.T) { + db := newTestDB(t, true) + applyVersion(t, db, store, 50, key, []byte("v50")) + applyVersion(t, db, store, 100, key, []byte("v100")) + + require.NoError(t, db.Prune(150)) + + // Sentinel must still serve the latest-value fast path. + bz, err := db.Get(store, 1000, key) + require.NoError(t, err) + require.Equal(t, []byte("v100"), bz) + }) +} From 1edd3eddb058f43b0c32429642bfe77e7ec3dba8 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Fri, 17 Apr 2026 00:08:59 -0400 Subject: [PATCH 04/15] fix: satisfy golangci-lint (copylocks, gosec, defers, staticcheck) - Change replayStateCacheMu to *sync.Mutex so Backend copies stay safe under copylocks (Backend has value-receiver methods and tracers.go dereferences the pointer into a local). - Annotate two int64->uint64 conversions on non-negative block heights with nolint:gosec, matching existing conventions in this package. - Wrap recordReadTrace defer calls in closures so time.Since is evaluated at defer fire time, not at defer registration. - Drop redundant Database selector from tracedDatabase iterator calls. 
--- evmrpc/simulate.go | 3 +- evmrpc/simulate_cache_test.go | 3 +- sei-db/db_engine/pebbledb/mvcc/batch.go | 2 +- sei-db/db_engine/pebbledb/mvcc/db.go | 38 ++++++++++++++----------- 4 files changed, 26 insertions(+), 20 deletions(-) diff --git a/evmrpc/simulate.go b/evmrpc/simulate.go index bcc0c8bc3d..2b09c1bf57 100644 --- a/evmrpc/simulate.go +++ b/evmrpc/simulate.go @@ -231,7 +231,7 @@ type Backend struct { globalBlockCache BlockCache cacheCreationMutex *sync.Mutex watermarks *WatermarkManager - replayStateCacheMu sync.Mutex + replayStateCacheMu *sync.Mutex replayStateCache *expirable.LRU[string, *blockReplayState] } @@ -273,6 +273,7 @@ func NewBackend( globalBlockCache: globalBlockCache, cacheCreationMutex: cacheCreationMutex, watermarks: watermarks, + replayStateCacheMu: &sync.Mutex{}, replayStateCache: expirable.NewLRU[string, *blockReplayState]( replayStateCacheBlocks, nil, replayStateCacheTTL, ), diff --git a/evmrpc/simulate_cache_test.go b/evmrpc/simulate_cache_test.go index 48ddcb1ac9..a43e180169 100644 --- a/evmrpc/simulate_cache_test.go +++ b/evmrpc/simulate_cache_test.go @@ -14,7 +14,8 @@ import ( func newTestBackend(size int, ttl time.Duration) *Backend { return &Backend{ - replayStateCache: expirable.NewLRU[string, *blockReplayState](size, nil, ttl), + replayStateCacheMu: &sync.Mutex{}, + replayStateCache: expirable.NewLRU[string, *blockReplayState](size, nil, ttl), } } diff --git a/sei-db/db_engine/pebbledb/mvcc/batch.go b/sei-db/db_engine/pebbledb/mvcc/batch.go index fd4f88fb17..e7a10cdd03 100644 --- a/sei-db/db_engine/pebbledb/mvcc/batch.go +++ b/sei-db/db_engine/pebbledb/mvcc/batch.go @@ -107,7 +107,7 @@ func (b *Batch) Write() (err error) { return err } var versionBz [VersionSize]byte - binary.LittleEndian.PutUint64(versionBz[:], uint64(b.version)) + binary.LittleEndian.PutUint64(versionBz[:], uint64(b.version)) //nolint:gosec // block heights are non-negative and fit in int64 if err := b.storage.Set([]byte(latestVersionKey), versionBz[:], 
defaultWriteOpts); err != nil { return fmt.Errorf("failed to update latest version after batch commit: %w", err) } diff --git a/sei-db/db_engine/pebbledb/mvcc/db.go b/sei-db/db_engine/pebbledb/mvcc/db.go index 70d7ea00ba..c8112752fa 100644 --- a/sei-db/db_engine/pebbledb/mvcc/db.go +++ b/sei-db/db_engine/pebbledb/mvcc/db.go @@ -348,13 +348,15 @@ func (db *Database) Has(storeKey string, version int64, key []byte) (bool, error func (db *Database) hasWithCollector(storeKey string, version int64, key []byte, collector types.ReadTraceCollector) (bool, error) { start := time.Now() - defer recordReadTrace(collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "mvcc", - Operation: "has", - DurationNanos: time.Since(start).Nanoseconds(), - Key: slices.Clone(key), - }) + defer func() { + recordReadTrace(collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "mvcc", + Operation: "has", + DurationNanos: time.Since(start).Nanoseconds(), + Key: slices.Clone(key), + }) + }() if version < db.GetEarliestVersion() { return false, nil } @@ -1218,13 +1220,15 @@ func (db *tracedDatabase) Get(storeKey string, version int64, key []byte) ([]byt func (db *tracedDatabase) Has(storeKey string, version int64, key []byte) (bool, error) { start := time.Now() - defer recordReadTrace(db.collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "mvcc", - Operation: "has", - DurationNanos: time.Since(start).Nanoseconds(), - Key: slices.Clone(key), - }) + defer func() { + recordReadTrace(db.collector, types.ReadTraceEvent{ + StoreKey: storeKey, + Layer: "mvcc", + Operation: "has", + DurationNanos: time.Since(start).Nanoseconds(), + Key: slices.Clone(key), + }) + }() val, err := db.getWithSession(storeKey, version, key) if err != nil { return false, err @@ -1233,11 +1237,11 @@ func (db *tracedDatabase) Has(storeKey string, version int64, key []byte) (bool, } func (db *tracedDatabase) Iterator(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { - 
return db.Database.iteratorWithCollector(storeKey, version, start, end, db.collector) + return db.iteratorWithCollector(storeKey, version, start, end, db.collector) } func (db *tracedDatabase) ReverseIterator(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { - return db.Database.reverseIteratorWithCollector(storeKey, version, start, end, db.collector) + return db.reverseIteratorWithCollector(storeKey, version, start, end, db.collector) } func recordReadTrace(collector types.ReadTraceCollector, event types.ReadTraceEvent) { @@ -1428,7 +1432,7 @@ func iteratorUpperBoundForLogicalKey(key []byte) []byte { // their requested version and decode tombstone/value info. func encodeLatestPointerValue(version int64, prefixedVal []byte) []byte { var versionBz [VersionSize]byte - binary.LittleEndian.PutUint64(versionBz[:], uint64(version)) + binary.LittleEndian.PutUint64(versionBz[:], uint64(version)) //nolint:gosec // block heights are non-negative and fit in int64 return append(versionBz[:], prefixedVal...) } From fd059b2e88142e97be94a82172a25ee0ad040355 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Fri, 17 Apr 2026 09:29:13 -0400 Subject: [PATCH 05/15] perf: remove sentinel latest-value fast path Drops the reserved math.MaxInt64 latest-pointer entry and returns the historical read path to the inverted-order SeekGE lookup plus the request-scoped read cache and reusable iterator session. Benchmarks showed the sentinel pointer made traces slower, not faster: the extra db.Get per read plus the write amplification from emitting a pointer entry per set outweighed the bloom-filter-accelerated hit. This keeps the other MVCC wins while removing the sentinel. Also drops the sentinel-skip branches in Prune/RawIterate/iterator and the corresponding sentinel-specific prune test. 
--- sei-db/db_engine/pebbledb/mvcc/batch.go | 12 --- sei-db/db_engine/pebbledb/mvcc/db.go | 100 ------------------- sei-db/db_engine/pebbledb/mvcc/iterator.go | 4 - sei-db/db_engine/pebbledb/mvcc/prune_test.go | 21 +--- 4 files changed, 3 insertions(+), 134 deletions(-) diff --git a/sei-db/db_engine/pebbledb/mvcc/batch.go b/sei-db/db_engine/pebbledb/mvcc/batch.go index e7a10cdd03..e2ba5e5696 100644 --- a/sei-db/db_engine/pebbledb/mvcc/batch.go +++ b/sei-db/db_engine/pebbledb/mvcc/batch.go @@ -51,14 +51,6 @@ func (b *Batch) set(storeKey string, tombstone int64, key, value []byte) error { prefixedVal := MVCCEncode(value, tombstone) b.appendSet(prefixedKey, prefixedVal) - // Also write a latest-version pointer at the sentinel version for fast - // direct-Get reads at recent heights. The sentinel is a reserved real - // MVCC version (math.MaxInt64), so the custom comparer parses it exactly - // like any other historical entry and no separate keyspace is introduced. - if storeKey != "" && len(key) > 0 { - latestPtrKey := MVCCEncode(prependStoreKey(storeKey, key), latestPointerVersion) - b.appendSet(latestPtrKey, encodeLatestPointerValue(b.version, prefixedVal)) - } return nil } @@ -140,10 +132,6 @@ func (b *RawBatch) set(storeKey string, tombstone int64, key, value []byte, vers prefixedVal := MVCCEncode(value, tombstone) b.appendSet(prefixedKey, prefixedVal) - if storeKey != "" && len(key) > 0 { - latestPtrKey := MVCCEncode(prependStoreKey(storeKey, key), latestPointerVersion) - b.appendSet(latestPtrKey, encodeLatestPointerValue(version, prefixedVal)) - } return nil } diff --git a/sei-db/db_engine/pebbledb/mvcc/db.go b/sei-db/db_engine/pebbledb/mvcc/db.go index c8112752fa..3495accdac 100644 --- a/sei-db/db_engine/pebbledb/mvcc/db.go +++ b/sei-db/db_engine/pebbledb/mvcc/db.go @@ -38,14 +38,6 @@ const ( earliestVersionKey = "s/_earliest" tombstoneVal = "TOMBSTONE" - // latestPointerVersion is a reserved MVCC version used to store a per-key - // "latest value" pointer 
alongside normal historical entries. Real block - // heights are always far below math.MaxInt64, so this version is never a - // real entry. Under descending version encoding it sorts first per key, - // which lets a direct db.Get(MVCCEncode(key, latestPointerVersion)) serve - // as a bloom-filter accelerated fast path. - latestPointerVersion = int64(math.MaxInt64) - // TODO: Make configurable ImportCommitBatchSize = 10000 PruneCommitBatchSize = 50 @@ -396,15 +388,6 @@ func (db *Database) getWithCollector(storeKey string, targetVersion int64, key [ return nil, nil } - if val, found, err := getLatestPointerValue( - func(k []byte) ([]byte, io.Closer, error) { return db.storage.Get(k) }, - storeKey, key, targetVersion, db.GetEarliestVersion(), db.config.KeepLastVersion, - ); err != nil { - return nil, err - } else if found { - return val, nil - } - prefixedVal, err := getMVCCSlice(db.storage, storeKey, key, targetVersion, collector) if err != nil { if errors.Is(err, errorutils.ErrRecordNotFound) { @@ -598,13 +581,6 @@ func (db *Database) Prune(version int64) (_err error) { return err } - // Skip the sentinel latest-pointer entry; prune only operates on real - // historical versions. The pointer is kept live and updated by writes. - if currVersionDecoded == latestPointerVersion { - itr.Next() - continue - } - // Reset per-logical-key state when the logical key changes. if !bytes.Equal(prevKey, currKey) { prevKey = slices.Clone(currKey) @@ -851,12 +827,6 @@ func (db *Database) RawIterate(storeKey string, fn func(key []byte, value []byte return false, err } - // Skip the sentinel latest-pointer entry; it is a derived index, not - // real data, and its value has a different encoding. 
- if currVersionDecoded == latestPointerVersion { - continue - } - // Decode the value currValEncoded := itr.Value() if valTombstoned(currValEncoded) { @@ -1274,16 +1244,6 @@ func (db *tracedDatabase) getWithSession(storeKey string, targetVersion int64, k return nil, nil } - if val, found, err := getLatestPointerValue( - func(k []byte) ([]byte, io.Closer, error) { return db.readSession.snapshot.Get(k) }, - storeKey, key, targetVersion, db.GetEarliestVersion(), db.config.KeepLastVersion, - ); err != nil { - return nil, err - } else if found { - db.readSession.store(storeKey, targetVersion, key, val) - return val, nil - } - if val, found := db.readSession.lookup(storeKey, targetVersion, key); found { recordReadTrace(db.collector, types.ReadTraceEvent{ StoreKey: storeKey, @@ -1427,66 +1387,6 @@ func iteratorUpperBoundForLogicalKey(key []byte) []byte { return MVCCEncode(upperKeyPrefix, 0) } -// encodeLatestPointerValue stores the write version alongside the actual -// prefixed MVCC value so readers can both validate the pointer is visible at -// their requested version and decode tombstone/value info. -func encodeLatestPointerValue(version int64, prefixedVal []byte) []byte { - var versionBz [VersionSize]byte - binary.LittleEndian.PutUint64(versionBz[:], uint64(version)) //nolint:gosec // block heights are non-negative and fit in int64 - return append(versionBz[:], prefixedVal...) -} - -func decodeLatestPointerValue(bz []byte) (int64, []byte, error) { - if len(bz) < VersionSize { - return 0, nil, fmt.Errorf("latest pointer entry too short: %d", len(bz)) - } - v := binary.LittleEndian.Uint64(bz[:VersionSize]) - if v > math.MaxInt64 { - return 0, nil, fmt.Errorf("latest pointer version overflows int64: %d", v) - } - return int64(v), bz[VersionSize:], nil -} - -// getLatestPointerValue is the fast-path read: a single bloom-filter accelerated -// db.Get. Returns (value, true, nil) only when the pointer is visible at the -// caller's targetVersion. 
Falls back to the MVCC scan path otherwise. -func getLatestPointerValue( - getter func(key []byte) ([]byte, io.Closer, error), - storeKey string, - key []byte, - targetVersion, earliestVersion int64, - keepLastVersion bool, -) ([]byte, bool, error) { - if storeKey == "" || len(key) == 0 { - return nil, false, nil - } - latestKey := MVCCEncode(prependStoreKey(storeKey, key), latestPointerVersion) - val, closer, err := getter(latestKey) - if err != nil { - if errors.Is(err, pebble.ErrNotFound) { - return nil, false, nil - } - return nil, false, fmt.Errorf("failed latest-pointer lookup: %w", err) - } - defer func() { _ = closer.Close() }() - - latestVersion, prefixedVal, err := decodeLatestPointerValue(utils.Clone(val)) - if err != nil { - return nil, false, err - } - if latestVersion < earliestVersion && !keepLastVersion { - return nil, false, nil - } - if latestVersion > targetVersion { - return nil, false, nil - } - value, err := visibleValueAtVersion(prefixedVal, targetVersion) - if err != nil { - return nil, false, err - } - return value, true, nil -} - func valTombstoned(value []byte) bool { if value == nil { return false diff --git a/sei-db/db_engine/pebbledb/mvcc/iterator.go b/sei-db/db_engine/pebbledb/mvcc/iterator.go index 6715e80a86..778b2ec219 100644 --- a/sei-db/db_engine/pebbledb/mvcc/iterator.go +++ b/sei-db/db_engine/pebbledb/mvcc/iterator.go @@ -128,10 +128,6 @@ func (itr *iterator) seekVisibleVersionForKey(targetKey []byte) bool { if err != nil { return false } - // Never surface the sentinel latest-pointer entry through iteration. 
- if foundVersionDecoded == latestPointerVersion { - return false - } return foundVersionDecoded <= itr.version } diff --git a/sei-db/db_engine/pebbledb/mvcc/prune_test.go b/sei-db/db_engine/pebbledb/mvcc/prune_test.go index 00ef3c77f1..909a2666be 100644 --- a/sei-db/db_engine/pebbledb/mvcc/prune_test.go +++ b/sei-db/db_engine/pebbledb/mvcc/prune_test.go @@ -10,9 +10,9 @@ import ( "github.com/sei-protocol/sei-chain/sei-db/proto" ) -// rawVersionsForKey returns every on-disk MVCC version for (store, key), -// excluding the sentinel latest-pointer entry. Used to assert pruning -// actually deletes data rather than just bumping earliestVersion. +// rawVersionsForKey returns every on-disk MVCC version for (store, key). Used +// to assert pruning actually deletes data rather than just bumping +// earliestVersion. func rawVersionsForKey(t *testing.T, db *Database, store string, key []byte) []int64 { t.Helper() prefix := prependStoreKey(store, key) @@ -28,9 +28,6 @@ func rawVersionsForKey(t *testing.T, db *Database, store string, key []byte) []i require.True(t, ok) v, err := decodeUint64Descending(vBz) require.NoError(t, err) - if v == latestPointerVersion { - continue - } versions = append(versions, v) } return versions @@ -122,16 +119,4 @@ func TestPruneDescendingOrder_DeletesOldVersions(t *testing.T) { require.ElementsMatch(t, []int64{140}, rawVersionsForKey(t, db, store, k2)) }) - t.Run("latest-pointer sentinel is never pruned", func(t *testing.T) { - db := newTestDB(t, true) - applyVersion(t, db, store, 50, key, []byte("v50")) - applyVersion(t, db, store, 100, key, []byte("v100")) - - require.NoError(t, db.Prune(150)) - - // Sentinel must still serve the latest-value fast path. 
- bz, err := db.Get(store, 1000, key) - require.NoError(t, err) - require.Equal(t, []byte("v100"), bz) - }) } From ef9b1859dc55d07f920d755408f4dbb04177cc56 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Fri, 17 Apr 2026 09:40:33 -0400 Subject: [PATCH 06/15] chore: review cleanups - Drop unused batchOp.order field; sortBatchOps sorts by key only. - Drop unused NewReadTraceEvent helper and its time import. - Fix misleading Reverse:true trace events in getMVCCSlice and getMVCCSliceWithSession; these paths are forward SeekGE + First. - Drop stale "reverse iteration is NOT supported" comment on iterator. - Rename shadowed builtin cap to size in replay-cache eviction test. --- evmrpc/simulate_cache_test.go | 17 +++++++---------- sei-db/db_engine/pebbledb/mvcc/batch.go | 4 ---- sei-db/db_engine/pebbledb/mvcc/db.go | 6 ------ sei-db/db_engine/pebbledb/mvcc/iterator.go | 5 +---- sei-db/db_engine/types/types.go | 10 ---------- 5 files changed, 8 insertions(+), 34 deletions(-) diff --git a/evmrpc/simulate_cache_test.go b/evmrpc/simulate_cache_test.go index a43e180169..f9bca37107 100644 --- a/evmrpc/simulate_cache_test.go +++ b/evmrpc/simulate_cache_test.go @@ -51,27 +51,24 @@ func TestReplayStateCache_GetPut(t *testing.T) { } // TestReplayStateCache_EvictsOldBlocks is the regression test for the -// unbounded-memory-growth bug: distinct blocks beyond the cache cap must +// unbounded-memory-growth bug: distinct blocks beyond the cache size must // be evicted, not retained forever. func TestReplayStateCache_EvictsOldBlocks(t *testing.T) { - const cap = 4 - b := newTestBackend(cap, time.Hour) + const size = 4 + b := newTestBackend(size, time.Hour) - // Insert more distinct blocks than the cap. 
- for i := 0; i < cap*3; i++ { + for i := 0; i < size*3; i++ { hash := fmt.Sprintf("0x%d", i) b.putReplayState(hash, 0, sdk.Context{}.WithBlockHeight(int64(i))) } - require.LessOrEqual(t, b.replayStateCache.Len(), cap, - "cache must not grow beyond its configured cap") + require.LessOrEqual(t, b.replayStateCache.Len(), size, + "cache must not grow beyond its configured size") - // The earliest blocks are gone. _, _, ok := b.getReplayState("0x0", 0) require.False(t, ok, "oldest block must have been evicted") - // The most recent blocks are still present. - _, _, ok = b.getReplayState(fmt.Sprintf("0x%d", cap*3-1), 0) + _, _, ok = b.getReplayState(fmt.Sprintf("0x%d", size*3-1), 0) require.True(t, ok, "most recently added block must still be cached") } diff --git a/sei-db/db_engine/pebbledb/mvcc/batch.go b/sei-db/db_engine/pebbledb/mvcc/batch.go index e2ba5e5696..8541767291 100644 --- a/sei-db/db_engine/pebbledb/mvcc/batch.go +++ b/sei-db/db_engine/pebbledb/mvcc/batch.go @@ -23,7 +23,6 @@ type batchOp struct { key []byte value []byte delete bool - order int } func NewBatch(storage *pebble.DB, version int64) (*Batch, error) { @@ -190,7 +189,6 @@ func (b *Batch) appendSet(key, value []byte) { b.ops = append(b.ops, batchOp{ key: append([]byte(nil), key...), value: append([]byte(nil), value...), - order: len(b.ops), }) } @@ -198,7 +196,6 @@ func (b *Batch) appendDelete(key []byte) { b.ops = append(b.ops, batchOp{ key: append([]byte(nil), key...), delete: true, - order: len(b.ops), }) } @@ -206,7 +203,6 @@ func (b *RawBatch) appendSet(key, value []byte) { b.ops = append(b.ops, batchOp{ key: append([]byte(nil), key...), value: append([]byte(nil), value...), - order: len(b.ops), }) } diff --git a/sei-db/db_engine/pebbledb/mvcc/db.go b/sei-db/db_engine/pebbledb/mvcc/db.go index 3495accdac..f0fc0d1d64 100644 --- a/sei-db/db_engine/pebbledb/mvcc/db.go +++ b/sei-db/db_engine/pebbledb/mvcc/db.go @@ -955,7 +955,6 @@ func getMVCCSlice(db *pebble.DB, storeKey string, key []byte, 
version int64, col DurationNanos: time.Since(iterStart).Nanoseconds(), Start: slices.Clone(lowerBound), End: slices.Clone(upperBound), - Reverse: true, }) defer func() { closeStart := time.Now() @@ -966,7 +965,6 @@ func getMVCCSlice(db *pebble.DB, storeKey string, key []byte, version int64, col Operation: "iterClose", DurationNanos: time.Since(closeStart).Nanoseconds(), Key: slices.Clone(key), - Reverse: true, }) }() @@ -991,7 +989,6 @@ func getMVCCSlice(db *pebble.DB, storeKey string, key []byte, version int64, col Operation: "iterKey", DurationNanos: time.Since(keyReadStart).Nanoseconds(), Key: rawIterKey, - Reverse: true, }) splitKeyStart := time.Now() @@ -1034,7 +1031,6 @@ func getMVCCSlice(db *pebble.DB, storeKey string, key []byte, version int64, col Operation: "iterValue", DurationNanos: time.Since(valueReadStart).Nanoseconds(), Key: rawIterKey, - Reverse: true, }) valueCloneStart := time.Now() @@ -1114,7 +1110,6 @@ func getMVCCSliceWithSession(session *historicalReadSession, storeKey string, ke Operation: "iterKey", DurationNanos: time.Since(keyReadStart).Nanoseconds(), Key: rawIterKey, - Reverse: true, }) splitKeyStart := time.Now() @@ -1156,7 +1151,6 @@ func getMVCCSliceWithSession(session *historicalReadSession, storeKey string, ke Operation: "iterValue", DurationNanos: time.Since(valueReadStart).Nanoseconds(), Key: rawIterKey, - Reverse: true, }) valueCloneStart := time.Now() diff --git a/sei-db/db_engine/pebbledb/mvcc/iterator.go b/sei-db/db_engine/pebbledb/mvcc/iterator.go index 778b2ec219..e45ecee162 100644 --- a/sei-db/db_engine/pebbledb/mvcc/iterator.go +++ b/sei-db/db_engine/pebbledb/mvcc/iterator.go @@ -23,10 +23,7 @@ var _ types.DBIterator = (*iterator)(nil) // in the provided domain for a given version. If a key has been written at the // provided version, that key/value pair will be iterated over. Otherwise, the // latest version for that key/value pair will be iterated over s.t. it's less -// than the provided version. 
Note: -// -// - The start key must not be empty. -// - Currently, reverse iteration is NOT supported. +// than the provided version. The start key must not be empty. type iterator struct { source *pebble.Iterator prefix, start, end []byte diff --git a/sei-db/db_engine/types/types.go b/sei-db/db_engine/types/types.go index a56366900b..6665c3982b 100644 --- a/sei-db/db_engine/types/types.go +++ b/sei-db/db_engine/types/types.go @@ -2,7 +2,6 @@ package types import ( "io" - "time" "github.com/sei-protocol/sei-chain/sei-db/proto" ) @@ -174,15 +173,6 @@ type TraceableStateStore interface { WithReadTraceCollector(ReadTraceCollector) StateStore } -func NewReadTraceEvent(storeKey, layer, operation string, duration time.Duration) ReadTraceEvent { - return ReadTraceEvent{ - StoreKey: storeKey, - Layer: layer, - Operation: operation, - DurationNanos: duration.Nanoseconds(), - } -} - // DBIterator iterates over versioned key-value pairs. type DBIterator interface { Domain() (start []byte, end []byte) From ed83a04388025eddd15dd0015197d55aa8638651 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Fri, 17 Apr 2026 10:04:47 -0400 Subject: [PATCH 07/15] fix: atomic latest-version write, dedupe MVCC slice helpers - Move latestVersionKey Set into the pebble batch so data and version metadata commit atomically, restoring the invariant that crash recovery observes a consistent version pointer. - Extract decodeMVCCEntry to share the post-seek validate/decode/clone logic between getMVCCSlice and getMVCCSliceWithSession. - Drop the fine-grained per-step trace events on the Get hot path and route the remaining top-level timing through traceGetMVCCSlice, which short-circuits when no collector is attached (no slices.Clone in the default zero-collector path). 
Co-Authored-By: Claude Opus 4.7 --- sei-db/db_engine/pebbledb/mvcc/batch.go | 9 +- sei-db/db_engine/pebbledb/mvcc/db.go | 209 +++--------------------- 2 files changed, 30 insertions(+), 188 deletions(-) diff --git a/sei-db/db_engine/pebbledb/mvcc/batch.go b/sei-db/db_engine/pebbledb/mvcc/batch.go index 8541767291..cc06acc46b 100644 --- a/sei-db/db_engine/pebbledb/mvcc/batch.go +++ b/sei-db/db_engine/pebbledb/mvcc/batch.go @@ -94,15 +94,12 @@ func (b *Batch) Write() (err error) { return fmt.Errorf("failed to write PebbleDB batch: %w", e) } } - if err := batch.Commit(defaultWriteOpts); err != nil { - return err - } var versionBz [VersionSize]byte binary.LittleEndian.PutUint64(versionBz[:], uint64(b.version)) //nolint:gosec // block heights are non-negative and fit in int64 - if err := b.storage.Set([]byte(latestVersionKey), versionBz[:], defaultWriteOpts); err != nil { - return fmt.Errorf("failed to update latest version after batch commit: %w", err) + if err := batch.Set([]byte(latestVersionKey), versionBz[:], nil); err != nil { + return fmt.Errorf("failed to set latest version in batch: %w", err) } - return nil + return batch.Commit(defaultWriteOpts) } // For writing kv pairs in any order of version diff --git a/sei-db/db_engine/pebbledb/mvcc/db.go b/sei-db/db_engine/pebbledb/mvcc/db.go index f0fc0d1d64..3a0f702329 100644 --- a/sei-db/db_engine/pebbledb/mvcc/db.go +++ b/sei-db/db_engine/pebbledb/mvcc/db.go @@ -925,245 +925,90 @@ func parseStoreKey(key []byte) (string, error) { return keyStr[LenPrefixStore : LenPrefixStore+slashIndex], nil } -func getMVCCSlice(db *pebble.DB, storeKey string, key []byte, version int64, collector types.ReadTraceCollector) ([]byte, error) { +func getMVCCSlice(db *pebble.DB, storeKey string, key []byte, version int64, collector types.ReadTraceCollector) (_ []byte, err error) { totalStart := time.Now() - defer func() { - recordReadTrace(collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "mvcc", - Operation: "getMVCCSlice", - 
DurationNanos: time.Since(totalStart).Nanoseconds(), - Key: slices.Clone(key), - }) - }() + defer traceGetMVCCSlice(collector, storeKey, key, totalStart) + prefixedKey := prependStoreKey(storeKey, key) - seekKey := MVCCEncode(prefixedKey, version) - lowerBound := seekKey - upperBound := iteratorUpperBoundForLogicalKey(prefixedKey) - iterStart := time.Now() itr, err := db.NewIter(&pebble.IterOptions{ - LowerBound: lowerBound, - UpperBound: upperBound, + LowerBound: MVCCEncode(prefixedKey, version), + UpperBound: iteratorUpperBoundForLogicalKey(prefixedKey), }) if err != nil { return nil, fmt.Errorf("failed to create PebbleDB iterator: %w", err) } - recordReadTrace(collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "pebble", - Operation: "newIter", - DurationNanos: time.Since(iterStart).Nanoseconds(), - Start: slices.Clone(lowerBound), - End: slices.Clone(upperBound), - }) defer func() { - closeStart := time.Now() err = errorutils.Join(err, itr.Close()) - recordReadTrace(collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "pebble", - Operation: "iterClose", - DurationNanos: time.Since(closeStart).Nanoseconds(), - Key: slices.Clone(key), - }) }() - firstStart := time.Now() - firstOK := itr.First() - recordReadTrace(collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "pebble", - Operation: "first", - DurationNanos: time.Since(firstStart).Nanoseconds(), - Key: slices.Clone(key), - }) - if !firstOK { - return nil, errorutils.ErrRecordNotFound - } - - keyReadStart := time.Now() - rawIterKey := slices.Clone(itr.Key()) - recordReadTrace(collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "pebble", - Operation: "iterKey", - DurationNanos: time.Since(keyReadStart).Nanoseconds(), - Key: rawIterKey, - }) - - splitKeyStart := time.Now() - userKey, vBz, ok := SplitMVCCKey(rawIterKey) - recordReadTrace(collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "mvcc", - Operation: "splitKey", - DurationNanos: 
time.Since(splitKeyStart).Nanoseconds(), - Key: rawIterKey, - }) - if !ok { - return nil, fmt.Errorf("invalid PebbleDB MVCC key: %s", rawIterKey) - } - if !bytes.Equal(userKey, prefixedKey) { - return nil, errorutils.ErrRecordNotFound - } - - decodeVersionStart := time.Now() - keyVersion, err := decodeUint64Descending(vBz) - recordReadTrace(collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "mvcc", - Operation: "decodeKeyVersion", - DurationNanos: time.Since(decodeVersionStart).Nanoseconds(), - Key: rawIterKey, - }) - if err != nil { - return nil, fmt.Errorf("failed to decode key version: %w", err) - } - if keyVersion > version { + if !itr.First() { return nil, errorutils.ErrRecordNotFound } - - valueReadStart := time.Now() - rawIterValue := itr.Value() - recordReadTrace(collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "pebble", - Operation: "iterValue", - DurationNanos: time.Since(valueReadStart).Nanoseconds(), - Key: rawIterKey, - }) - - valueCloneStart := time.Now() - clonedValue := slices.Clone(rawIterValue) - recordReadTrace(collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "mvcc", - Operation: "cloneValue", - DurationNanos: time.Since(valueCloneStart).Nanoseconds(), - Key: rawIterKey, - }) - - return clonedValue, nil + return decodeMVCCEntry(itr.Key(), itr.Value(), prefixedKey, version) } func getMVCCSliceWithSession(session *historicalReadSession, storeKey string, key []byte, version int64, collector types.ReadTraceCollector) ([]byte, error) { totalStart := time.Now() - defer func() { - recordReadTrace(collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "mvcc", - Operation: "getMVCCSlice", - DurationNanos: time.Since(totalStart).Nanoseconds(), - Key: slices.Clone(key), - }) - }() + defer traceGetMVCCSlice(collector, storeKey, key, totalStart) prefixedKey := prependStoreKey(storeKey, key) seekKey := MVCCEncode(prefixedKey, version) - itr, created, iterDuration, err := session.getOrCreateIterator(storeKey) 
+ itr, _, _, err := session.getOrCreateIterator(storeKey) if err != nil { return nil, err } - if created { - recordReadTrace(collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "pebble", - Operation: "newIter", - DurationNanos: iterDuration.Nanoseconds(), - Start: slices.Clone(MVCCEncode(prependStoreKey(storeKey, nil), 0)), - End: slices.Clone(iteratorUpperBoundForStore(storeKey)), - }) - } - seekStart := time.Now() session.mu.Lock() ok := itr.SeekGE(seekKey) - var ( - rawIterKey []byte - rawIterValue []byte - ) + var rawIterKey, rawIterValue []byte if ok { rawIterKey = slices.Clone(itr.Key()) - rawIterValue = slices.Clone(itr.Value()) + rawIterValue = itr.Value() // cloned by decodeMVCCEntry on the hit path } iterErr := itr.Error() session.mu.Unlock() - recordReadTrace(collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "pebble", - Operation: "seekGE", - DurationNanos: time.Since(seekStart).Nanoseconds(), - Key: slices.Clone(seekKey), - }) + if iterErr != nil { return nil, iterErr } if !ok { return nil, errorutils.ErrRecordNotFound } + return decodeMVCCEntry(rawIterKey, rawIterValue, prefixedKey, version) +} - keyReadStart := time.Now() - recordReadTrace(collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "pebble", - Operation: "iterKey", - DurationNanos: time.Since(keyReadStart).Nanoseconds(), - Key: rawIterKey, - }) - - splitKeyStart := time.Now() +// decodeMVCCEntry validates that the iterator's current entry belongs to +// prefixedKey at a version <= target and returns a safe copy of the value. 
+func decodeMVCCEntry(rawIterKey, rawIterValue, prefixedKey []byte, version int64) ([]byte, error) { userKey, vBz, ok := SplitMVCCKey(rawIterKey) - recordReadTrace(collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "mvcc", - Operation: "splitKey", - DurationNanos: time.Since(splitKeyStart).Nanoseconds(), - Key: rawIterKey, - }) if !ok { return nil, fmt.Errorf("invalid PebbleDB MVCC key: %s", rawIterKey) } if !bytes.Equal(userKey, prefixedKey) { return nil, errorutils.ErrRecordNotFound } - - decodeVersionStart := time.Now() keyVersion, err := decodeUint64Descending(vBz) - recordReadTrace(collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "mvcc", - Operation: "decodeKeyVersion", - DurationNanos: time.Since(decodeVersionStart).Nanoseconds(), - Key: rawIterKey, - }) if err != nil { return nil, fmt.Errorf("failed to decode key version: %w", err) } if keyVersion > version { return nil, errorutils.ErrRecordNotFound } + return slices.Clone(rawIterValue), nil +} - valueReadStart := time.Now() - recordReadTrace(collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "pebble", - Operation: "iterValue", - DurationNanos: time.Since(valueReadStart).Nanoseconds(), - Key: rawIterKey, - }) - - valueCloneStart := time.Now() - clonedValue := slices.Clone(rawIterValue) - recordReadTrace(collector, types.ReadTraceEvent{ +func traceGetMVCCSlice(collector types.ReadTraceCollector, storeKey string, key []byte, start time.Time) { + if collector == nil { + return + } + collector.RecordReadTrace(types.ReadTraceEvent{ StoreKey: storeKey, Layer: "mvcc", - Operation: "cloneValue", - DurationNanos: time.Since(valueCloneStart).Nanoseconds(), - Key: rawIterKey, + Operation: "getMVCCSlice", + DurationNanos: time.Since(start).Nanoseconds(), + Key: slices.Clone(key), }) - - return clonedValue, nil } func (db *Database) WithReadTraceCollector(collector types.ReadTraceCollector) types.StateStore { From 2842f68ffeab08b1d66b07cf969b232f027d9129 Mon Sep 17 00:00:00 2001 
From: kbhat1 Date: Fri, 17 Apr 2026 12:04:51 -0400 Subject: [PATCH 08/15] chore: remove read-trace infrastructure from perf-mvcc Drops the ReadTrace types, per-op trace hooks, tracedDatabase wrapper, and historicalReadSession read-cache. This branch is scoped to MVCC perf changes; the tracing plumbing belongs with the debug trace-profile endpoint PR and can be re-introduced there once it lands. Co-Authored-By: Claude Opus 4.7 (1M context) --- sei-db/db_engine/pebbledb/mvcc/db.go | 323 +-------------------- sei-db/db_engine/pebbledb/mvcc/iterator.go | 61 ++-- sei-db/db_engine/types/types.go | 19 -- 3 files changed, 23 insertions(+), 380 deletions(-) diff --git a/sei-db/db_engine/pebbledb/mvcc/db.go b/sei-db/db_engine/pebbledb/mvcc/db.go index 3a0f702329..5135e07afb 100644 --- a/sei-db/db_engine/pebbledb/mvcc/db.go +++ b/sei-db/db_engine/pebbledb/mvcc/db.go @@ -6,7 +6,6 @@ import ( "encoding/binary" "errors" "fmt" - "io" "math" "strings" "sync" @@ -46,41 +45,11 @@ const ( ) var ( - _ types.StateStore = (*Database)(nil) - _ types.TraceableStateStore = (*Database)(nil) + _ types.StateStore = (*Database)(nil) defaultWriteOpts = pebble.NoSync ) -type tracedDatabase struct { - *Database - collector types.ReadTraceCollector - readSession *historicalReadSession -} - -type readTraceCloserRegistry interface { - AddReadTraceCloser(io.Closer) -} - -type historicalReadSession struct { - snapshot *pebble.Snapshot - iterators map[string]*pebble.Iterator - cache map[historicalReadCacheKey]historicalReadCacheValue - mu sync.Mutex - closed bool -} - -type historicalReadCacheKey struct { - storeKey string - version int64 - key string -} - -type historicalReadCacheValue struct { - value []byte - found bool -} - type Database struct { storage *pebble.DB asyncWriteWG sync.WaitGroup @@ -335,25 +304,11 @@ func retrieveEarliestVersion(db *pebble.DB) (int64, error) { } func (db *Database) Has(storeKey string, version int64, key []byte) (bool, error) { - return db.hasWithCollector(storeKey, 
version, key, nil) -} - -func (db *Database) hasWithCollector(storeKey string, version int64, key []byte, collector types.ReadTraceCollector) (bool, error) { - start := time.Now() - defer func() { - recordReadTrace(collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "mvcc", - Operation: "has", - DurationNanos: time.Since(start).Nanoseconds(), - Key: slices.Clone(key), - }) - }() if version < db.GetEarliestVersion() { return false, nil } - val, err := db.getWithCollector(storeKey, version, key, collector) + val, err := db.Get(storeKey, version, key) if err != nil { return false, err } @@ -362,10 +317,6 @@ func (db *Database) hasWithCollector(storeKey string, version int64, key []byte, } func (db *Database) Get(storeKey string, targetVersion int64, key []byte) (_ []byte, _err error) { - return db.getWithCollector(storeKey, targetVersion, key, nil) -} - -func (db *Database) getWithCollector(storeKey string, targetVersion int64, key []byte, collector types.ReadTraceCollector) (_ []byte, _err error) { startTime := time.Now() defer func() { otelMetrics.getLatency.Record( @@ -376,19 +327,12 @@ func (db *Database) getWithCollector(storeKey string, targetVersion int64, key [ attribute.String("store", storeKey), ), ) - recordReadTrace(collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "mvcc", - Operation: "get", - DurationNanos: time.Since(startTime).Nanoseconds(), - Key: slices.Clone(key), - }) }() if targetVersion < db.GetEarliestVersion() { return nil, nil } - prefixedVal, err := getMVCCSlice(db.storage, storeKey, key, targetVersion, collector) + prefixedVal, err := getMVCCSlice(db.storage, storeKey, key, targetVersion) if err != nil { if errors.Is(err, errorutils.ErrRecordNotFound) { return nil, nil @@ -633,10 +577,6 @@ func (db *Database) Prune(version int64) (_err error) { } func (db *Database) Iterator(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { - return db.iteratorWithCollector(storeKey, version, start, end, 
nil) -} - -func (db *Database) iteratorWithCollector(storeKey string, version int64, start, end []byte, collector types.ReadTraceCollector) (types.DBIterator, error) { if (start != nil && len(start) == 0) || (end != nil && len(end) == 0) { return nil, errorutils.ErrKeyEmpty } @@ -654,21 +594,12 @@ func (db *Database) iteratorWithCollector(storeKey string, version int64, start, upperBound = iteratorUpperBoundForStore(storeKey) } - iterStart := time.Now() itr, err := db.storage.NewIter(&pebble.IterOptions{LowerBound: lowerBound, UpperBound: upperBound}) if err != nil { return nil, fmt.Errorf("failed to create PebbleDB iterator: %w", err) } - recordReadTrace(collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "pebble", - Operation: "newIter", - DurationNanos: time.Since(iterStart).Nanoseconds(), - Start: slices.Clone(lowerBound), - End: slices.Clone(upperBound), - }) - return newPebbleDBIterator(itr, storePrefix(storeKey), start, end, version, db.GetEarliestVersion(), false, storeKey, collector), nil + return newPebbleDBIterator(itr, storePrefix(storeKey), start, end, version, db.GetEarliestVersion(), false, storeKey), nil } // Taken from pebbledb prefix upper bound @@ -686,10 +617,6 @@ func prefixEnd(b []byte) []byte { } func (db *Database) ReverseIterator(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { - return db.reverseIteratorWithCollector(storeKey, version, start, end, nil) -} - -func (db *Database) reverseIteratorWithCollector(storeKey string, version int64, start, end []byte, collector types.ReadTraceCollector) (types.DBIterator, error) { if (start != nil && len(start) == 0) || (end != nil && len(end) == 0) { return nil, errorutils.ErrKeyEmpty } @@ -707,22 +634,12 @@ func (db *Database) reverseIteratorWithCollector(storeKey string, version int64, upperBound = MVCCEncode(prefixEnd(storePrefix(storeKey)), 0) } - iterStart := time.Now() itr, err := db.storage.NewIter(&pebble.IterOptions{LowerBound: lowerBound, 
UpperBound: upperBound}) if err != nil { return nil, fmt.Errorf("failed to create PebbleDB iterator: %w", err) } - recordReadTrace(collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "pebble", - Operation: "newIter", - DurationNanos: time.Since(iterStart).Nanoseconds(), - Start: slices.Clone(lowerBound), - End: slices.Clone(upperBound), - Reverse: true, - }) - return newPebbleDBIterator(itr, storePrefix(storeKey), start, end, version, db.GetEarliestVersion(), true, storeKey, collector), nil + return newPebbleDBIterator(itr, storePrefix(storeKey), start, end, version, db.GetEarliestVersion(), true, storeKey), nil } // Import loads the initial version of the state in parallel with numWorkers goroutines @@ -925,10 +842,7 @@ func parseStoreKey(key []byte) (string, error) { return keyStr[LenPrefixStore : LenPrefixStore+slashIndex], nil } -func getMVCCSlice(db *pebble.DB, storeKey string, key []byte, version int64, collector types.ReadTraceCollector) (_ []byte, err error) { - totalStart := time.Now() - defer traceGetMVCCSlice(collector, storeKey, key, totalStart) - +func getMVCCSlice(db *pebble.DB, storeKey string, key []byte, version int64) (_ []byte, err error) { prefixedKey := prependStoreKey(storeKey, key) itr, err := db.NewIter(&pebble.IterOptions{ LowerBound: MVCCEncode(prefixedKey, version), @@ -947,37 +861,6 @@ func getMVCCSlice(db *pebble.DB, storeKey string, key []byte, version int64, col return decodeMVCCEntry(itr.Key(), itr.Value(), prefixedKey, version) } -func getMVCCSliceWithSession(session *historicalReadSession, storeKey string, key []byte, version int64, collector types.ReadTraceCollector) ([]byte, error) { - totalStart := time.Now() - defer traceGetMVCCSlice(collector, storeKey, key, totalStart) - - prefixedKey := prependStoreKey(storeKey, key) - seekKey := MVCCEncode(prefixedKey, version) - - itr, _, _, err := session.getOrCreateIterator(storeKey) - if err != nil { - return nil, err - } - - session.mu.Lock() - ok := itr.SeekGE(seekKey) - 
var rawIterKey, rawIterValue []byte - if ok { - rawIterKey = slices.Clone(itr.Key()) - rawIterValue = itr.Value() // cloned by decodeMVCCEntry on the hit path - } - iterErr := itr.Error() - session.mu.Unlock() - - if iterErr != nil { - return nil, iterErr - } - if !ok { - return nil, errorutils.ErrRecordNotFound - } - return decodeMVCCEntry(rawIterKey, rawIterValue, prefixedKey, version) -} - // decodeMVCCEntry validates that the iterator's current entry belongs to // prefixedKey at a version <= target and returns a safe copy of the value. func decodeMVCCEntry(rawIterKey, rawIterValue, prefixedKey []byte, version int64) ([]byte, error) { @@ -998,126 +881,6 @@ func decodeMVCCEntry(rawIterKey, rawIterValue, prefixedKey []byte, version int64 return slices.Clone(rawIterValue), nil } -func traceGetMVCCSlice(collector types.ReadTraceCollector, storeKey string, key []byte, start time.Time) { - if collector == nil { - return - } - collector.RecordReadTrace(types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "mvcc", - Operation: "getMVCCSlice", - DurationNanos: time.Since(start).Nanoseconds(), - Key: slices.Clone(key), - }) -} - -func (db *Database) WithReadTraceCollector(collector types.ReadTraceCollector) types.StateStore { - if collector == nil { - return db - } - session := newHistoricalReadSession(db.storage) - traced := &tracedDatabase{Database: db, collector: collector, readSession: session} - if registry, ok := collector.(readTraceCloserRegistry); ok { - registry.AddReadTraceCloser(session) - } - return traced -} - -func (db *tracedDatabase) Get(storeKey string, version int64, key []byte) ([]byte, error) { - return db.getWithSession(storeKey, version, key) -} - -func (db *tracedDatabase) Has(storeKey string, version int64, key []byte) (bool, error) { - start := time.Now() - defer func() { - recordReadTrace(db.collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "mvcc", - Operation: "has", - DurationNanos: time.Since(start).Nanoseconds(), - Key: 
slices.Clone(key), - }) - }() - val, err := db.getWithSession(storeKey, version, key) - if err != nil { - return false, err - } - return val != nil, nil -} - -func (db *tracedDatabase) Iterator(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { - return db.iteratorWithCollector(storeKey, version, start, end, db.collector) -} - -func (db *tracedDatabase) ReverseIterator(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { - return db.reverseIteratorWithCollector(storeKey, version, start, end, db.collector) -} - -func recordReadTrace(collector types.ReadTraceCollector, event types.ReadTraceEvent) { - if collector == nil { - return - } - collector.RecordReadTrace(event) -} - -func (db *tracedDatabase) getWithSession(storeKey string, targetVersion int64, key []byte) (_ []byte, _err error) { - startTime := time.Now() - defer func() { - otelMetrics.getLatency.Record( - context.Background(), - time.Since(startTime).Seconds(), - metric.WithAttributes( - attribute.Bool("success", _err == nil), - attribute.String("store", storeKey), - ), - ) - recordReadTrace(db.collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "mvcc", - Operation: "get", - DurationNanos: time.Since(startTime).Nanoseconds(), - Key: slices.Clone(key), - }) - }() - if targetVersion < db.GetEarliestVersion() { - return nil, nil - } - - if val, found := db.readSession.lookup(storeKey, targetVersion, key); found { - recordReadTrace(db.collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "mvcc", - Operation: "readCacheHit", - DurationNanos: 0, - Key: slices.Clone(key), - }) - return val, nil - } - recordReadTrace(db.collector, types.ReadTraceEvent{ - StoreKey: storeKey, - Layer: "mvcc", - Operation: "readCacheMiss", - DurationNanos: 0, - Key: slices.Clone(key), - }) - - prefixedVal, err := getMVCCSliceWithSession(db.readSession, storeKey, key, targetVersion, db.collector) - if err != nil { - if errors.Is(err, 
errorutils.ErrRecordNotFound) { - db.readSession.store(storeKey, targetVersion, key, nil) - return nil, nil - } - return nil, fmt.Errorf("failed to perform PebbleDB read: %w", err) - } - - val, err := visibleValueAtVersion(prefixedVal, targetVersion) - if err != nil { - return nil, err - } - db.readSession.store(storeKey, targetVersion, key, val) - return val, nil -} - func visibleValueAtVersion(prefixedVal []byte, targetVersion int64) ([]byte, error) { valBz, tombBz, ok := SplitMVCCKey(prefixedVal) if !ok { @@ -1136,80 +899,6 @@ func visibleValueAtVersion(prefixedVal []byte, targetVersion int64) ([]byte, err return nil, nil } -func newHistoricalReadSession(db *pebble.DB) *historicalReadSession { - session := &historicalReadSession{ - snapshot: db.NewSnapshot(), - iterators: map[string]*pebble.Iterator{}, - cache: map[historicalReadCacheKey]historicalReadCacheValue{}, - } - return session -} - -func (s *historicalReadSession) lookup(storeKey string, version int64, key []byte) ([]byte, bool) { - s.mu.Lock() - defer s.mu.Unlock() - entry, ok := s.cache[historicalReadCacheKey{storeKey: storeKey, version: version, key: string(key)}] - if !ok { - return nil, false - } - if !entry.found { - return nil, true - } - return slices.Clone(entry.value), true -} - -func (s *historicalReadSession) store(storeKey string, version int64, key []byte, value []byte) { - s.mu.Lock() - defer s.mu.Unlock() - cacheValue := historicalReadCacheValue{found: value != nil} - if value != nil { - cacheValue.value = slices.Clone(value) - } - s.cache[historicalReadCacheKey{storeKey: storeKey, version: version, key: string(key)}] = cacheValue -} - -func (s *historicalReadSession) getOrCreateIterator(storeKey string) (*pebble.Iterator, bool, time.Duration, error) { - s.mu.Lock() - defer s.mu.Unlock() - if itr, ok := s.iterators[storeKey]; ok { - return itr, false, 0, nil - } - start := time.Now() - itr, err := s.snapshot.NewIter(&pebble.IterOptions{ - LowerBound: MVCCEncode(prependStoreKey(storeKey, 
nil), 0), - UpperBound: iteratorUpperBoundForStore(storeKey), - }) - if err != nil { - return nil, false, 0, err - } - s.iterators[storeKey] = itr - return itr, true, time.Since(start), nil -} - -func (s *historicalReadSession) Close() error { - s.mu.Lock() - defer s.mu.Unlock() - if s.closed { - return nil - } - s.closed = true - var lastErr error - for _, itr := range s.iterators { - if err := itr.Close(); err != nil { - lastErr = err - } - } - if s.snapshot != nil { - if err := s.snapshot.Close(); err != nil { - lastErr = err - } - } - s.iterators = nil - s.cache = nil - s.snapshot = nil - return lastErr -} - func iteratorUpperBoundForStore(storeKey string) []byte { upperStorePrefix := prefixEnd(storePrefix(storeKey)) if upperStorePrefix == nil { diff --git a/sei-db/db_engine/pebbledb/mvcc/iterator.go b/sei-db/db_engine/pebbledb/mvcc/iterator.go index e45ecee162..60d4d47f25 100644 --- a/sei-db/db_engine/pebbledb/mvcc/iterator.go +++ b/sei-db/db_engine/pebbledb/mvcc/iterator.go @@ -6,7 +6,6 @@ import ( "fmt" "math" "sync" - "time" "github.com/cockroachdb/pebble/v2" "go.opentelemetry.io/otel/attribute" @@ -32,30 +31,27 @@ type iterator struct { reverse bool iterationCount int64 storeKey string - collector types.ReadTraceCollector closeSync sync.Once } -func newPebbleDBIterator(src *pebble.Iterator, prefix, mvccStart, mvccEnd []byte, version int64, earliestVersion int64, reverse bool, storeKey string, collector types.ReadTraceCollector) *iterator { +func newPebbleDBIterator(src *pebble.Iterator, prefix, mvccStart, mvccEnd []byte, version int64, earliestVersion int64, reverse bool, storeKey string) *iterator { // Return invalid iterator if requested iterator height is lower than earliest version after pruning if version < earliestVersion { return &iterator{ - source: src, - prefix: prefix, - start: mvccStart, - end: mvccEnd, - version: version, - valid: false, - reverse: reverse, - storeKey: storeKey, - collector: collector, + source: src, + prefix: prefix, + start: 
mvccStart, + end: mvccEnd, + version: version, + valid: false, + reverse: reverse, + storeKey: storeKey, } } // move the underlying PebbleDB iterator to the first key var valid bool - positionStart := time.Now() if reverse { valid = src.Last() } else { @@ -63,20 +59,14 @@ func newPebbleDBIterator(src *pebble.Iterator, prefix, mvccStart, mvccEnd []byte } itr := &iterator{ - source: src, - prefix: prefix, - start: mvccStart, - end: mvccEnd, - version: version, - valid: valid, - reverse: reverse, - storeKey: storeKey, - collector: collector, - } - if reverse { - itr.recordPebbleOp("last", time.Since(positionStart), nil) - } else { - itr.recordPebbleOp("first", time.Since(positionStart), nil) + source: src, + prefix: prefix, + start: mvccStart, + end: mvccEnd, + version: version, + valid: valid, + reverse: reverse, + storeKey: storeKey, } if valid { @@ -107,9 +97,7 @@ func newPebbleDBIterator(src *pebble.Iterator, prefix, mvccStart, mvccEnd []byte func (itr *iterator) seekVisibleVersionForKey(targetKey []byte) bool { seekKey := MVCCEncode(targetKey, itr.version) - seekStart := time.Now() valid := itr.source.SeekGE(seekKey) - itr.recordPebbleOp("seekGE", time.Since(seekStart), seekKey) if !valid { return false } @@ -134,9 +122,7 @@ func (itr *iterator) nextLogicalKey(currKey []byte) ([]byte, bool) { return nil, false } seekKey := MVCCEncode(nextKeyPrefix, math.MaxInt64) - seekStart := time.Now() valid := itr.source.SeekGE(seekKey) - itr.recordPebbleOp("seekGE", time.Since(seekStart), seekKey) if !valid { return nil, false } @@ -149,9 +135,7 @@ func (itr *iterator) nextLogicalKey(currKey []byte) ([]byte, bool) { func (itr *iterator) prevLogicalKey(currKey []byte) ([]byte, bool) { seekKey := MVCCEncode(currKey, math.MaxInt64) - seekStart := time.Now() valid := itr.source.SeekLT(seekKey) - itr.recordPebbleOp("seekLT", time.Since(seekStart), seekKey) if !valid { return nil, false } @@ -420,14 +404,3 @@ func (itr *iterator) DebugRawIterate() { } } } - -func (itr *iterator) 
recordPebbleOp(operation string, duration time.Duration, key []byte) { - recordReadTrace(itr.collector, types.ReadTraceEvent{ - StoreKey: itr.storeKey, - Layer: "pebble", - Operation: operation, - DurationNanos: duration.Nanoseconds(), - Key: slices.Clone(key), - Reverse: itr.reverse, - }) -} diff --git a/sei-db/db_engine/types/types.go b/sei-db/db_engine/types/types.go index 6665c3982b..161017381d 100644 --- a/sei-db/db_engine/types/types.go +++ b/sei-db/db_engine/types/types.go @@ -154,25 +154,6 @@ type StateStore interface { io.Closer } -type ReadTraceEvent struct { - StoreKey string - Layer string - Operation string - DurationNanos int64 - Key []byte - Start []byte - End []byte - Reverse bool -} - -type ReadTraceCollector interface { - RecordReadTrace(ReadTraceEvent) -} - -type TraceableStateStore interface { - WithReadTraceCollector(ReadTraceCollector) StateStore -} - // DBIterator iterates over versioned key-value pairs. type DBIterator interface { Domain() (start []byte, end []byte) From 33ce175e0230e9856459f15e31e5aee8803a2f5f Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Fri, 17 Apr 2026 12:29:24 -0400 Subject: [PATCH 09/15] feat: gate pebbledb OpenDB on descending-MVCC sentinel marker Reject legacy ascending-version databases on open. On a populated DB lacking the s/_mvcc_descending marker, return an error that points operators at state sync; on an empty DB, write the marker so future opens fast-path. Protects against silent version mis-decoding after an in-place upgrade to the new encoding. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- sei-db/db_engine/pebbledb/mvcc/db.go | 40 ++++++++++- .../pebbledb/mvcc/descending_marker_test.go | 72 +++++++++++++++++++ 2 files changed, 111 insertions(+), 1 deletion(-) create mode 100644 sei-db/db_engine/pebbledb/mvcc/descending_marker_test.go diff --git a/sei-db/db_engine/pebbledb/mvcc/db.go b/sei-db/db_engine/pebbledb/mvcc/db.go index 5135e07afb..ae3f8c0968 100644 --- a/sei-db/db_engine/pebbledb/mvcc/db.go +++ b/sei-db/db_engine/pebbledb/mvcc/db.go @@ -35,7 +35,12 @@ const ( StorePrefixTpl = "s/k:%s/" // s/k: latestVersionKey = "s/_latest" earliestVersionKey = "s/_earliest" - tombstoneVal = "TOMBSTONE" + // descendingMVCCMarkerKey flags that the DB was initialized with the + // descending-version MVCC encoding. Its absence on a populated DB means + // the data was written by the legacy ascending-version build and is not + // safe to read with this code. + descendingMVCCMarkerKey = "s/_mvcc_descending" + tombstoneVal = "TOMBSTONE" // TODO: Make configurable ImportCommitBatchSize = 10000 @@ -140,6 +145,11 @@ func OpenDB(dataDir string, config config.StateStoreConfig) (types.StateStore, e return nil, fmt.Errorf("failed to open PebbleDB: %w", err) } + if err := assertDescendingMVCCOrCreate(db); err != nil { + _ = db.Close() + return nil, err + } + // Initialize earliest version earliestVersion, err := retrieveEarliestVersion(db) if err != nil { @@ -238,6 +248,34 @@ func (db *Database) GetLatestVersion() int64 { } // Retrieve latestVersion from db, if not found, return 0. +// assertDescendingMVCCOrCreate refuses to open a populated DB that lacks the +// descendingMVCCMarkerKey sentinel (i.e. one written by the legacy +// ascending-version build). Empty DBs are marked and allowed through. 
+func assertDescendingMVCCOrCreate(db *pebble.DB) error { + if _, closer, err := db.Get([]byte(descendingMVCCMarkerKey)); err == nil { + _ = closer.Close() + return nil + } else if !errors.Is(err, pebble.ErrNotFound) { + return fmt.Errorf("reading descending-MVCC marker: %w", err) + } + + if _, closer, err := db.Get([]byte(latestVersionKey)); err == nil { + _ = closer.Close() + return fmt.Errorf( + "pebbledb at this path was created with ascending-version MVCC and " + + "is incompatible with this build's descending-version encoding; " + + "state sync required to rebuild the state store", + ) + } else if !errors.Is(err, pebble.ErrNotFound) { + return fmt.Errorf("reading latest version marker: %w", err) + } + + if err := db.Set([]byte(descendingMVCCMarkerKey), []byte{1}, defaultWriteOpts); err != nil { + return fmt.Errorf("writing descending-MVCC marker: %w", err) + } + return nil +} + func retrieveLatestVersion(db *pebble.DB) (int64, error) { bz, closer, err := db.Get([]byte(latestVersionKey)) defer func() { diff --git a/sei-db/db_engine/pebbledb/mvcc/descending_marker_test.go b/sei-db/db_engine/pebbledb/mvcc/descending_marker_test.go new file mode 100644 index 0000000000..24a3b9290c --- /dev/null +++ b/sei-db/db_engine/pebbledb/mvcc/descending_marker_test.go @@ -0,0 +1,72 @@ +package mvcc + +import ( + "encoding/binary" + "testing" + + "github.com/cockroachdb/pebble/v2" + "github.com/stretchr/testify/require" + + "github.com/sei-protocol/sei-chain/sei-db/config" +) + +// TestDescendingMVCCMarker_FreshDBWritesMarker verifies that opening an empty +// pebbledb writes the descending-MVCC sentinel so subsequent opens fast-path. +func TestDescendingMVCCMarker_FreshDBWritesMarker(t *testing.T) { + dir := t.TempDir() + cfg := config.DefaultStateStoreConfig() + cfg.Backend = "pebbledb" + + store, err := OpenDB(dir, cfg) + require.NoError(t, err) + require.NoError(t, store.Close()) + + // Reopen the raw pebble DB and check the sentinel is there. 
+ raw, err := pebble.Open(dir, &pebble.Options{Comparer: MVCCComparer}) + require.NoError(t, err) + defer func() { _ = raw.Close() }() + + val, closer, err := raw.Get([]byte(descendingMVCCMarkerKey)) + require.NoError(t, err) + require.NotEmpty(t, val) + require.NoError(t, closer.Close()) +} + +// TestDescendingMVCCMarker_LegacyDBRejected simulates a DB written by the old +// ascending-version build (latestVersionKey present, no marker) and asserts we +// refuse to open it rather than silently returning wrong versions. +func TestDescendingMVCCMarker_LegacyDBRejected(t *testing.T) { + dir := t.TempDir() + + raw, err := pebble.Open(dir, &pebble.Options{Comparer: MVCCComparer}) + require.NoError(t, err) + var ts [VersionSize]byte + binary.LittleEndian.PutUint64(ts[:], uint64(42)) + require.NoError(t, raw.Set([]byte(latestVersionKey), ts[:], pebble.Sync)) + require.NoError(t, raw.Close()) + + cfg := config.DefaultStateStoreConfig() + cfg.Backend = "pebbledb" + + _, err = OpenDB(dir, cfg) + require.Error(t, err) + require.Contains(t, err.Error(), "state sync required") +} + +// TestDescendingMVCCMarker_RoundTrip writes data with OpenDB, reopens, and +// confirms the second open succeeds (marker is honored, no false rejection). 
+func TestDescendingMVCCMarker_RoundTrip(t *testing.T) { + dir := t.TempDir() + cfg := config.DefaultStateStoreConfig() + cfg.Backend = "pebbledb" + + store, err := OpenDB(dir, cfg) + require.NoError(t, err) + db := store.(*Database) + applyVersion(t, db, "store1", 1, []byte("k"), []byte("v")) + require.NoError(t, db.Close()) + + store2, err := OpenDB(dir, cfg) + require.NoError(t, err) + require.NoError(t, store2.Close()) +} From 7de931aba1ec8c14d06a2504532276fea0e6734b Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Fri, 17 Apr 2026 13:03:36 -0400 Subject: [PATCH 10/15] feat: auto-detect MVCC encoding, keep ascending path for legacy DBs Archive nodes that cannot state-sync need to keep reading and writing their existing ascending-version MVCC databases. Rather than forcing a migration, detect the on-disk encoding at open time and dispatch to the matching code path. - OpenDB sets Database.descending from the s/_mvcc_descending sentinel: present or empty DB -> descending (fast path, marker written on fresh DBs); marker absent on a populated DB -> ascending legacy mode, no error, no marker write. - Public Get/Has/Iterator/ReverseIterator/Prune dispatch to *Descending or *Ascending implementations. Batch threads the mode through writes (NewBatchWithMode) so ApplyChangeset / Import / DeleteKeysAtVersion persist in the DB's native encoding. - File layout makes the split obvious for review: - db_descending.go, iterator_descending.go -- perf fast path (this branch's rewrite) - db_ascending.go, iterator_ascending.go -- verbatim port of main's legacy implementation, only adjusted to use MVCCEncodeAscending and the ascendingIterator type - comparator.go -- both encode/decode variants plus a parameterized MVCCEncode(key, v, descending) for shared call sites - Tests cover all three open paths (fresh-writes-marker, legacy-opens- ascending, marked-roundtrip) including a write+read+reopen cycle on a seeded legacy DB to confirm the ascending path is end-to-end correct. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- sei-db/db_engine/pebbledb/mvcc/batch.go | 48 ++- sei-db/db_engine/pebbledb/mvcc/comparator.go | 61 ++- sei-db/db_engine/pebbledb/mvcc/db.go | 388 ++++------------- .../db_engine/pebbledb/mvcc/db_ascending.go | 297 +++++++++++++ .../db_engine/pebbledb/mvcc/db_descending.go | 323 ++++++++++++++ .../pebbledb/mvcc/descending_marker_test.go | 78 +++- .../pebbledb/mvcc/iterator_ascending.go | 398 ++++++++++++++++++ .../{iterator.go => iterator_descending.go} | 6 +- sei-db/db_engine/pebbledb/mvcc/prune_test.go | 4 +- 9 files changed, 1260 insertions(+), 343 deletions(-) create mode 100644 sei-db/db_engine/pebbledb/mvcc/db_ascending.go create mode 100644 sei-db/db_engine/pebbledb/mvcc/db_descending.go create mode 100644 sei-db/db_engine/pebbledb/mvcc/iterator_ascending.go rename sei-db/db_engine/pebbledb/mvcc/{iterator.go => iterator_descending.go} (98%) diff --git a/sei-db/db_engine/pebbledb/mvcc/batch.go b/sei-db/db_engine/pebbledb/mvcc/batch.go index cc06acc46b..261c913453 100644 --- a/sei-db/db_engine/pebbledb/mvcc/batch.go +++ b/sei-db/db_engine/pebbledb/mvcc/batch.go @@ -14,9 +14,10 @@ import ( ) type Batch struct { - storage *pebble.DB - version int64 - ops []batchOp + storage *pebble.DB + version int64 + ops []batchOp + descending bool } type batchOp struct { @@ -25,14 +26,22 @@ type batchOp struct { delete bool } +// NewBatch creates a new descending-mode Batch. Callers that need ascending-mode +// encoding for legacy DBs should use NewBatchWithMode. func NewBatch(storage *pebble.DB, version int64) (*Batch, error) { + return NewBatchWithMode(storage, version, true) +} + +// NewBatchWithMode creates a new Batch using the supplied MVCC encoding mode. 
+func NewBatchWithMode(storage *pebble.DB, version int64, descending bool) (*Batch, error) { if version < 0 { return nil, fmt.Errorf("version must be non-negative") } b := &Batch{ - storage: storage, - version: version, - ops: make([]batchOp, 0, 16), + storage: storage, + version: version, + ops: make([]batchOp, 0, 16), + descending: descending, } return b, nil } @@ -46,8 +55,8 @@ func (b *Batch) Reset() { } func (b *Batch) set(storeKey string, tombstone int64, key, value []byte) error { - prefixedKey := MVCCEncode(prependStoreKey(storeKey, key), b.version) - prefixedVal := MVCCEncode(value, tombstone) + prefixedKey := MVCCEncode(prependStoreKey(storeKey, key), b.version, b.descending) + prefixedVal := MVCCEncode(value, tombstone, b.descending) b.appendSet(prefixedKey, prefixedVal) return nil @@ -104,14 +113,23 @@ func (b *Batch) Write() (err error) { // For writing kv pairs in any order of version type RawBatch struct { - storage *pebble.DB - ops []batchOp + storage *pebble.DB + ops []batchOp + descending bool } +// NewRawBatch creates a new descending-mode RawBatch. func NewRawBatch(storage *pebble.DB) (*RawBatch, error) { + return NewRawBatchWithMode(storage, true) +} + +// NewRawBatchWithMode creates a new RawBatch using the supplied MVCC encoding +// mode. 
+func NewRawBatchWithMode(storage *pebble.DB, descending bool) (*RawBatch, error) { return &RawBatch{ - storage: storage, - ops: make([]batchOp, 0, 16), + storage: storage, + ops: make([]batchOp, 0, 16), + descending: descending, }, nil } @@ -124,8 +142,8 @@ func (b *RawBatch) Reset() { } func (b *RawBatch) set(storeKey string, tombstone int64, key, value []byte, version int64) error { - prefixedKey := MVCCEncode(prependStoreKey(storeKey, key), version) - prefixedVal := MVCCEncode(value, tombstone) + prefixedKey := MVCCEncode(prependStoreKey(storeKey, key), version, b.descending) + prefixedVal := MVCCEncode(value, tombstone, b.descending) b.appendSet(prefixedKey, prefixedVal) return nil @@ -142,7 +160,7 @@ func (b *RawBatch) Delete(storeKey string, key []byte, version int64) error { // HardDelete physically removes the key by encoding it with the batch’s version // and calling the underlying pebble.Batch.Delete. func (b *Batch) HardDelete(storeKey string, key []byte) error { - fullKey := MVCCEncode(prependStoreKey(storeKey, key), b.version) + fullKey := MVCCEncode(prependStoreKey(storeKey, key), b.version, b.descending) b.appendDelete(fullKey) return nil } diff --git a/sei-db/db_engine/pebbledb/mvcc/comparator.go b/sei-db/db_engine/pebbledb/mvcc/comparator.go index a6b8afab61..4d0af09233 100644 --- a/sei-db/db_engine/pebbledb/mvcc/comparator.go +++ b/sei-db/db_engine/pebbledb/mvcc/comparator.go @@ -208,8 +208,23 @@ func MVCCKeyCompare(a, b []byte) int { return bytes.Compare(aTS, bTS) } +// MVCCEncode dispatches between the descending and ascending encoders based on +// the mode flag. Descending-mode is used for fresh DBs created by this build; +// ascending-mode preserves compatibility with legacy DBs written by the +// previous ascending-version build. 
+func MVCCEncode(key []byte, version int64, descending bool) []byte { + if descending { + return MVCCEncodeDescending(key, version) + } + return MVCCEncodeAscending(key, version) +} + +// MVCCEncodeDescending encodes an MVCC key with the version encoded in +// descending byte order so newer versions sort before older ones for the same +// logical key. +// // \x00[]<#version-bytes> -func MVCCEncode(key []byte, version int64) (dst []byte) { +func MVCCEncodeDescending(key []byte, version int64) (dst []byte) { dst = append(dst, key...) dst = append(dst, 0) @@ -222,6 +237,23 @@ func MVCCEncode(key []byte, version int64) (dst []byte) { return dst } +// MVCCEncodeAscending encodes an MVCC key with the version encoded in +// ascending byte order. This matches the legacy on-disk format used by main. +// +// \x00[]<#version-bytes> +func MVCCEncodeAscending(key []byte, version int64) (dst []byte) { + dst = append(dst, key...) + dst = append(dst, 0) + + if version > 0 { + extra := byte(1 + 8) + dst = encodeUint64Ascending(dst, uint64(version)) + dst = append(dst, extra) + } + + return dst +} + // encodeUint64Descending encodes the uint64 value in descending order so newer // versions sort before older versions for the same logical key. func encodeUint64Descending(dst []byte, v uint64) []byte { @@ -248,3 +280,30 @@ func decodeUint64Descending(b []byte) (int64, error) { v := int64(uv) return v, nil } + +// encodeUint64Ascending encodes the uint64 value using a big-endian 8 byte +// representation. The bytes are appended to the supplied buffer and +// the final buffer is returned. +func encodeUint64Ascending(dst []byte, v uint64) []byte { + return append( + dst, + byte(v>>56), byte(v>>48), byte(v>>40), byte(v>>32), + byte(v>>24), byte(v>>16), byte(v>>8), byte(v), + ) +} + +// decodeUint64Ascending decodes a int64 from the input buffer, treating +// the input as a big-endian 8 byte uint64 representation. The decoded int64 is +// returned. 
+func decodeUint64Ascending(b []byte) (int64, error) { + if len(b) < 8 { + return 0, fmt.Errorf("insufficient bytes to decode uint64 int value; expected 8; got %d", len(b)) + } + + uv := binary.BigEndian.Uint64(b) + if uv > math.MaxInt64 { + return 0, fmt.Errorf("uint64 value overflows int64: %d", uv) + } + v := int64(uv) + return v, nil +} diff --git a/sei-db/db_engine/pebbledb/mvcc/db.go b/sei-db/db_engine/pebbledb/mvcc/db.go index ae3f8c0968..04882ea33e 100644 --- a/sei-db/db_engine/pebbledb/mvcc/db.go +++ b/sei-db/db_engine/pebbledb/mvcc/db.go @@ -17,9 +17,7 @@ import ( "github.com/cockroachdb/pebble/v2/sstable" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" - "golang.org/x/exp/slices" - errorutils "github.com/sei-protocol/sei-chain/sei-db/common/errors" "github.com/sei-protocol/sei-chain/sei-db/common/utils" "github.com/sei-protocol/sei-chain/sei-db/config" "github.com/sei-protocol/sei-chain/sei-db/db_engine/types" @@ -63,6 +61,12 @@ type Database struct { earliestVersion atomic.Int64 // Latest version for db latestVersion atomic.Int64 + // descending indicates whether this DB uses descending-version MVCC + // encoding (fresh DBs created by this build) or the legacy + // ascending-version encoding (DBs created by the previous build). The + // mode is detected on open and is immutable for the lifetime of the + // Database. 
+ descending bool // Map of module to when each was last updated // Used in pruning to skip over stores that have not been updated recently @@ -145,7 +149,8 @@ func OpenDB(dataDir string, config config.StateStoreConfig) (types.StateStore, e return nil, fmt.Errorf("failed to open PebbleDB: %w", err) } - if err := assertDescendingMVCCOrCreate(db); err != nil { + descending, err := detectMVCCMode(db) + if err != nil { _ = db.Close() return nil, err } @@ -170,6 +175,7 @@ func OpenDB(dataDir string, config config.StateStoreConfig) (types.StateStore, e config: config, earliestVersion: atomic.Int64{}, latestVersion: atomic.Int64{}, + descending: descending, pendingChanges: make(chan VersionedChangesets, config.AsyncWriteBuffer), } database.latestVersion.Store(latestVersion) @@ -223,6 +229,24 @@ func (db *Database) Close() error { return err } +// mvccEncode encodes a key with the MVCC version encoding matching this +// Database's on-disk mode. +func (db *Database) mvccEncode(key []byte, version int64) []byte { + if db.descending { + return MVCCEncodeDescending(key, version) + } + return MVCCEncodeAscending(key, version) +} + +// decodeVersion decodes an on-disk MVCC version using the encoding matching +// this Database's mode. +func (db *Database) decodeVersion(vBz []byte) (int64, error) { + if db.descending { + return decodeUint64Descending(vBz) + } + return decodeUint64Ascending(vBz) +} + // PebbleMetrics returns the underlying Pebble DB metrics for observability (e.g. compaction/flush counts). // Returns nil if the database is closed. func (db *Database) PebbleMetrics() *pebble.Metrics { @@ -247,33 +271,37 @@ func (db *Database) GetLatestVersion() int64 { return db.latestVersion.Load() } -// Retrieve latestVersion from db, if not found, return 0. -// assertDescendingMVCCOrCreate refuses to open a populated DB that lacks the -// descendingMVCCMarkerKey sentinel (i.e. one written by the legacy -// ascending-version build). Empty DBs are marked and allowed through. 
-func assertDescendingMVCCOrCreate(db *pebble.DB) error { +// detectMVCCMode inspects the DB to determine which MVCC encoding to use. +// +// - If the descendingMVCCMarkerKey sentinel is present, the DB was created +// by this build and is in descending mode. +// - If the marker is absent but latestVersionKey is present, the DB was +// populated by the legacy ascending-version build. We open it in +// ascending mode without writing the marker (legacy DBs stay unmarked +// forever). +// - If both markers are absent the DB is fresh; we write the descending +// marker and return descending mode. +func detectMVCCMode(db *pebble.DB) (bool, error) { if _, closer, err := db.Get([]byte(descendingMVCCMarkerKey)); err == nil { _ = closer.Close() - return nil + return true, nil } else if !errors.Is(err, pebble.ErrNotFound) { - return fmt.Errorf("reading descending-MVCC marker: %w", err) + return false, fmt.Errorf("reading descending-MVCC marker: %w", err) } if _, closer, err := db.Get([]byte(latestVersionKey)); err == nil { _ = closer.Close() - return fmt.Errorf( - "pebbledb at this path was created with ascending-version MVCC and " + - "is incompatible with this build's descending-version encoding; " + - "state sync required to rebuild the state store", - ) + // Legacy DB: no marker, has data. Open in ascending mode. + return false, nil } else if !errors.Is(err, pebble.ErrNotFound) { - return fmt.Errorf("reading latest version marker: %w", err) + return false, fmt.Errorf("reading latest version marker: %w", err) } + // Fresh DB: mark it and use descending mode. 
if err := db.Set([]byte(descendingMVCCMarkerKey), []byte{1}, defaultWriteOpts); err != nil { - return fmt.Errorf("writing descending-MVCC marker: %w", err) + return false, fmt.Errorf("writing descending-MVCC marker: %w", err) } - return nil + return true, nil } func retrieveLatestVersion(db *pebble.DB) (int64, error) { @@ -341,45 +369,22 @@ func retrieveEarliestVersion(db *pebble.DB) (int64, error) { return int64(ubz), nil } +// Has dispatches between descending- and ascending-mode implementations +// depending on the on-disk encoding detected at open time. func (db *Database) Has(storeKey string, version int64, key []byte) (bool, error) { - if version < db.GetEarliestVersion() { - return false, nil - } - - val, err := db.Get(storeKey, version, key) - if err != nil { - return false, err + if db.descending { + return db.hasDescending(storeKey, version, key) } - - return val != nil, nil + return db.hasAscending(storeKey, version, key) } -func (db *Database) Get(storeKey string, targetVersion int64, key []byte) (_ []byte, _err error) { - startTime := time.Now() - defer func() { - otelMetrics.getLatency.Record( - context.Background(), - time.Since(startTime).Seconds(), - metric.WithAttributes( - attribute.Bool("success", _err == nil), - attribute.String("store", storeKey), - ), - ) - }() - if targetVersion < db.GetEarliestVersion() { - return nil, nil - } - - prefixedVal, err := getMVCCSlice(db.storage, storeKey, key, targetVersion) - if err != nil { - if errors.Is(err, errorutils.ErrRecordNotFound) { - return nil, nil - } - - return nil, fmt.Errorf("failed to perform PebbleDB read: %w", err) +// Get dispatches between descending- and ascending-mode implementations +// depending on the on-disk encoding detected at open time. 
+func (db *Database) Get(storeKey string, targetVersion int64, key []byte) ([]byte, error) { + if db.descending { + return db.getDescending(storeKey, targetVersion, key) } - - return visibleValueAtVersion(prefixedVal, targetVersion) + return db.getAscending(storeKey, targetVersion, key) } func (db *Database) ApplyChangesetSync(version int64, changeset []*proto.NamedChangeSet) (_err error) { @@ -399,7 +404,7 @@ func (db *Database) ApplyChangesetSync(version int64, changeset []*proto.NamedCh } // Create batch and persist latest version in the batch - b, err := NewBatch(db.storage, version) + b, err := NewBatchWithMode(db.storage, version, db.descending) if err != nil { return err } @@ -483,161 +488,31 @@ func (db *Database) WaitForPendingWrites() { <-done } -// Prune attempts to prune all versions up to and including the current version -// Get the range of keys, manually iterate over them and delete them -// We add a heuristic to skip over a module's keys during pruning if it hasn't been updated -// since the last time pruning occurred. -// NOTE: There is a rare case when a module's keys are skipped during pruning even though -// it has been updated. This occurs when that module's keys are updated in between pruning runs, the node after is restarted. -// This is not a large issue given the next time that module is updated, it will be properly pruned thereafter. 
-func (db *Database) Prune(version int64) (_err error) { - // Defensive check: ensure database is not closed - if db.storage == nil { - return errors.New("pebbledb: database is closed") - } - - startTime := time.Now() - defer func() { - otelMetrics.pruneLatency.Record( - context.Background(), - time.Since(startTime).Seconds(), - metric.WithAttributes( - attribute.Bool("success", _err == nil), - ), - ) - }() - - earliestVersion := version + 1 // we increment by 1 to include the provided version - - itr, err := db.storage.NewIter(nil) - if err != nil { - return err - } - defer func() { _ = itr.Close() }() - - batch := db.storage.NewBatch() - defer func() { _ = batch.Close() }() - - var ( - counter int - prevKey []byte - keptBelowPrune bool - prevStore string - ) - - for itr.First(); itr.Valid(); { - currKeyEncoded := slices.Clone(itr.Key()) - - // Ignore metadata entries during pruning - if isMetadataKey(currKeyEncoded) { - itr.Next() - continue - } - - // Store current key and version - currKey, currVersion, currOK := SplitMVCCKey(currKeyEncoded) - if !currOK { - return fmt.Errorf("invalid MVCC key") - } - - storeKey, err := parseStoreKey(currKey) - if err != nil { - // XXX: This should never happen given we skip the metadata keys. - return err - } - - // For every new module visited, check to see last time it was updated - if storeKey != prevStore { - prevStore = storeKey - updated, ok := db.storeKeyDirty.Load(storeKey) - versionUpdated, typeOk := updated.(int64) - // Skip a store's keys if version it was last updated is less than last prune height - if !ok || (typeOk && versionUpdated < db.GetEarliestVersion()) { - itr.SeekGE(storePrefix(storeKey + "0")) - continue - } - } - - currVersionDecoded, err := decodeUint64Descending(currVersion) - if err != nil { - return err - } - - // Reset per-logical-key state when the logical key changes. 
- if !bytes.Equal(prevKey, currKey) { - prevKey = slices.Clone(currKey) - keptBelowPrune = false - - // Fast path: under descending encoding, versions of a key are stored - // newest-first. When the newest real version is above the prune - // height, seek directly to the first version <= prune height for - // this key instead of iterating through every above-prune version. - if currVersionDecoded > version { - itr.SeekGE(MVCCEncode(currKey, version)) - continue - } - } - - // Descending iteration: for a given logical key we see newest→oldest. - // Versions > prune height are always kept. For versions <= prune - // height, keep only the newest one when KeepLastVersion is true; - // delete every other such version. - if currVersionDecoded <= version { - if db.config.KeepLastVersion && !keptBelowPrune { - keptBelowPrune = true - } else { - if err := batch.Delete(currKeyEncoded, nil); err != nil { - return err - } - counter++ - if counter >= PruneCommitBatchSize { - if err := batch.Commit(defaultWriteOpts); err != nil { - return err - } - counter = 0 - batch.Reset() - } - } - } - - itr.Next() +// Prune dispatches between descending- and ascending-mode implementations +// depending on the on-disk encoding detected at open time. +func (db *Database) Prune(version int64) error { + if db.descending { + return db.pruneDescending(version) } - - // Commit any leftover delete ops in batch - if counter > 0 { - err = batch.Commit(defaultWriteOpts) - if err != nil { - return err - } - } - - return db.SetEarliestVersion(earliestVersion, false) + return db.pruneAscending(version) } +// Iterator dispatches between descending- and ascending-mode implementations +// depending on the on-disk encoding detected at open time. 
func (db *Database) Iterator(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { - if (start != nil && len(start) == 0) || (end != nil && len(end) == 0) { - return nil, errorutils.ErrKeyEmpty - } - - if start != nil && end != nil && bytes.Compare(start, end) > 0 { - return nil, errorutils.ErrStartAfterEnd - } - - lowerBound := MVCCEncode(prependStoreKey(storeKey, start), 0) - - var upperBound []byte - if end != nil { - upperBound = MVCCEncode(prependStoreKey(storeKey, end), 0) - } else { - upperBound = iteratorUpperBoundForStore(storeKey) + if db.descending { + return db.iteratorDescending(storeKey, version, start, end) } + return db.iteratorAscending(storeKey, version, start, end) +} - itr, err := db.storage.NewIter(&pebble.IterOptions{LowerBound: lowerBound, UpperBound: upperBound}) - if err != nil { - return nil, fmt.Errorf("failed to create PebbleDB iterator: %w", err) +// ReverseIterator dispatches between descending- and ascending-mode +// implementations depending on the on-disk encoding detected at open time. 
+func (db *Database) ReverseIterator(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { + if db.descending { + return db.reverseIteratorDescending(storeKey, version, start, end) } - - return newPebbleDBIterator(itr, storePrefix(storeKey), start, end, version, db.GetEarliestVersion(), false, storeKey), nil + return db.reverseIteratorAscending(storeKey, version, start, end) } // Taken from pebbledb prefix upper bound @@ -654,32 +529,6 @@ func prefixEnd(b []byte) []byte { return nil } -func (db *Database) ReverseIterator(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { - if (start != nil && len(start) == 0) || (end != nil && len(end) == 0) { - return nil, errorutils.ErrKeyEmpty - } - - if start != nil && end != nil && bytes.Compare(start, end) > 0 { - return nil, errorutils.ErrStartAfterEnd - } - - lowerBound := MVCCEncode(prependStoreKey(storeKey, start), 0) - - var upperBound []byte - if end != nil { - upperBound = MVCCEncode(prependStoreKey(storeKey, end), 0) - } else { - upperBound = MVCCEncode(prefixEnd(storePrefix(storeKey)), 0) - } - - itr, err := db.storage.NewIter(&pebble.IterOptions{LowerBound: lowerBound, UpperBound: upperBound}) - if err != nil { - return nil, fmt.Errorf("failed to create PebbleDB iterator: %w", err) - } - - return newPebbleDBIterator(itr, storePrefix(storeKey), start, end, version, db.GetEarliestVersion(), true, storeKey), nil -} - // Import loads the initial version of the state in parallel with numWorkers goroutines // TODO: Potentially add retries instead of panics func (db *Database) Import(version int64, ch <-chan types.SnapshotNode) (_err error) { @@ -698,7 +547,7 @@ func (db *Database) Import(version int64, ch <-chan types.SnapshotNode) (_err er worker := func() { defer wg.Done() - batch, err := NewBatch(db.storage, version) + batch, err := NewBatchWithMode(db.storage, version, db.descending) if err != nil { panic(err) } @@ -719,7 +568,7 @@ func (db *Database) Import(version 
int64, ch <-chan types.SnapshotNode) (_err er panic(err) } - batch, err = NewBatch(db.storage, version) + batch, err = NewBatchWithMode(db.storage, version, db.descending) if err != nil { panic(err) } @@ -746,7 +595,7 @@ func (db *Database) Import(version int64, ch <-chan types.SnapshotNode) (_err er // RawIterate iterates over all keys and values for a store func (db *Database) RawIterate(storeKey string, fn func(key []byte, value []byte, version int64) bool) (bool, error) { // Iterate through all keys and values for a store - lowerBound := MVCCEncode(prependStoreKey(storeKey, nil), 0) + lowerBound := db.mvccEncode(prependStoreKey(storeKey, nil), 0) prefix := storePrefix(storeKey) itr, err := db.storage.NewIter(&pebble.IterOptions{LowerBound: lowerBound}) @@ -777,7 +626,7 @@ func (db *Database) RawIterate(storeKey string, fn func(key []byte, value []byte // Parse prefix out of the key parsedKey := currKey[len(prefix):] - currVersionDecoded, err := decodeUint64Descending(currVersion) + currVersionDecoded, err := db.decodeVersion(currVersion) if err != nil { return false, err } @@ -804,7 +653,7 @@ func (db *Database) RawIterate(storeKey string, fn func(key []byte, value []byte func (db *Database) DeleteKeysAtVersion(module string, version int64) error { - batch, err := NewBatch(db.storage, version) + batch, err := NewBatchWithMode(db.storage, version, db.descending) if err != nil { return fmt.Errorf("failed to create deletion batch for module %q: %w", module, err) } @@ -824,7 +673,7 @@ func (db *Database) DeleteKeysAtVersion(module string, version int64) error { return true } deleteCounter = 0 - batch, err = NewBatch(db.storage, version) + batch, err = NewBatchWithMode(db.storage, version, db.descending) if err != nil { fmt.Printf("Error creating a new deletion batch for module %q: %v\n", module, err) return true @@ -880,79 +729,6 @@ func parseStoreKey(key []byte) (string, error) { return keyStr[LenPrefixStore : LenPrefixStore+slashIndex], nil } -func 
getMVCCSlice(db *pebble.DB, storeKey string, key []byte, version int64) (_ []byte, err error) { - prefixedKey := prependStoreKey(storeKey, key) - itr, err := db.NewIter(&pebble.IterOptions{ - LowerBound: MVCCEncode(prefixedKey, version), - UpperBound: iteratorUpperBoundForLogicalKey(prefixedKey), - }) - if err != nil { - return nil, fmt.Errorf("failed to create PebbleDB iterator: %w", err) - } - defer func() { - err = errorutils.Join(err, itr.Close()) - }() - - if !itr.First() { - return nil, errorutils.ErrRecordNotFound - } - return decodeMVCCEntry(itr.Key(), itr.Value(), prefixedKey, version) -} - -// decodeMVCCEntry validates that the iterator's current entry belongs to -// prefixedKey at a version <= target and returns a safe copy of the value. -func decodeMVCCEntry(rawIterKey, rawIterValue, prefixedKey []byte, version int64) ([]byte, error) { - userKey, vBz, ok := SplitMVCCKey(rawIterKey) - if !ok { - return nil, fmt.Errorf("invalid PebbleDB MVCC key: %s", rawIterKey) - } - if !bytes.Equal(userKey, prefixedKey) { - return nil, errorutils.ErrRecordNotFound - } - keyVersion, err := decodeUint64Descending(vBz) - if err != nil { - return nil, fmt.Errorf("failed to decode key version: %w", err) - } - if keyVersion > version { - return nil, errorutils.ErrRecordNotFound - } - return slices.Clone(rawIterValue), nil -} - -func visibleValueAtVersion(prefixedVal []byte, targetVersion int64) ([]byte, error) { - valBz, tombBz, ok := SplitMVCCKey(prefixedVal) - if !ok { - return nil, fmt.Errorf("invalid PebbleDB MVCC value: %s", prefixedVal) - } - if len(tombBz) == 0 { - return valBz, nil - } - tombstone, err := decodeUint64Descending(tombBz) - if err != nil { - return nil, fmt.Errorf("failed to decode value tombstone: %w", err) - } - if targetVersion < tombstone { - return valBz, nil - } - return nil, nil -} - -func iteratorUpperBoundForStore(storeKey string) []byte { - upperStorePrefix := prefixEnd(storePrefix(storeKey)) - if upperStorePrefix == nil { - return nil - } - 
return MVCCEncode(upperStorePrefix, 0) -} - -func iteratorUpperBoundForLogicalKey(key []byte) []byte { - upperKeyPrefix := prefixEnd(key) - if upperKeyPrefix == nil { - return nil - } - return MVCCEncode(upperKeyPrefix, 0) -} - func valTombstoned(value []byte) bool { if value == nil { return false diff --git a/sei-db/db_engine/pebbledb/mvcc/db_ascending.go b/sei-db/db_engine/pebbledb/mvcc/db_ascending.go new file mode 100644 index 0000000000..ee116d3192 --- /dev/null +++ b/sei-db/db_engine/pebbledb/mvcc/db_ascending.go @@ -0,0 +1,297 @@ +package mvcc + +import ( + "bytes" + "context" + "errors" + "fmt" + "math" + "time" + + "github.com/cockroachdb/pebble/v2" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + "golang.org/x/exp/slices" + + errorutils "github.com/sei-protocol/sei-chain/sei-db/common/errors" + "github.com/sei-protocol/sei-chain/sei-db/db_engine/types" +) + +// This file contains the ascending-version MVCC implementation used to read +// and write legacy DBs that were created before the descending-version fast +// path was introduced. It is a verbatim port of main's Get/Has/Iterator/ +// ReverseIterator/Prune path, adjusted only to use the *Ascending encoding +// helpers and the ascendingIterator type. Archive nodes that cannot migrate +// will continue to hit this path. 
+ +func (db *Database) hasAscending(storeKey string, version int64, key []byte) (bool, error) { + if version < db.GetEarliestVersion() { + return false, nil + } + + val, err := db.getAscending(storeKey, version, key) + if err != nil { + return false, err + } + + return val != nil, nil +} + +func (db *Database) getAscending(storeKey string, targetVersion int64, key []byte) (_ []byte, _err error) { + startTime := time.Now() + defer func() { + otelMetrics.getLatency.Record( + context.Background(), + time.Since(startTime).Seconds(), + metric.WithAttributes( + attribute.Bool("success", _err == nil), + attribute.String("store", storeKey), + ), + ) + }() + if targetVersion < db.GetEarliestVersion() { + return nil, nil + } + + prefixedVal, err := getMVCCSliceAscending(db.storage, storeKey, key, targetVersion) + if err != nil { + if errors.Is(err, errorutils.ErrRecordNotFound) { + return nil, nil + } + + return nil, fmt.Errorf("failed to perform PebbleDB read: %w", err) + } + + valBz, tombBz, ok := SplitMVCCKey(prefixedVal) + if !ok { + return nil, fmt.Errorf("invalid PebbleDB MVCC value: %s", prefixedVal) + } + + // A tombstone of zero or a target version that is less than the tombstone + // version means the key is not deleted at the target version. + if len(tombBz) == 0 { + return valBz, nil + } + + tombstone, err := decodeUint64Ascending(tombBz) + if err != nil { + return nil, fmt.Errorf("failed to decode value tombstone: %w", err) + } + + // A tombstone of zero or a target version that is less than the tombstone + // version means the key is not deleted at the target version. 
+ if targetVersion < tombstone { + return valBz, nil + } + + // the value is considered deleted + return nil, nil +} + +func (db *Database) pruneAscending(version int64) (_err error) { + // Defensive check: ensure database is not closed + if db.storage == nil { + return errors.New("pebbledb: database is closed") + } + + startTime := time.Now() + defer func() { + otelMetrics.pruneLatency.Record( + context.Background(), + time.Since(startTime).Seconds(), + metric.WithAttributes( + attribute.Bool("success", _err == nil), + ), + ) + }() + + earliestVersion := version + 1 // we increment by 1 to include the provided version + + itr, err := db.storage.NewIter(nil) + if err != nil { + return err + } + defer func() { _ = itr.Close() }() + + batch := db.storage.NewBatch() + defer func() { _ = batch.Close() }() + + var ( + counter int + prevKey, prevKeyEncoded, prevValEncoded []byte + prevVersionDecoded int64 + prevStore string + ) + + for itr.First(); itr.Valid(); { + currKeyEncoded := slices.Clone(itr.Key()) + + // Ignore metadata entries during pruning + if isMetadataKey(currKeyEncoded) { + itr.Next() + continue + } + + // Store current key and version + currKey, currVersion, currOK := SplitMVCCKey(currKeyEncoded) + if !currOK { + return fmt.Errorf("invalid MVCC key") + } + + storeKey, err := parseStoreKey(currKey) + if err != nil { + // XXX: This should never happen given we skip the metadata keys. 
+ return err + } + + // For every new module visited, check to see last time it was updated + if storeKey != prevStore { + prevStore = storeKey + updated, ok := db.storeKeyDirty.Load(storeKey) + versionUpdated, typeOk := updated.(int64) + // Skip a store's keys if version it was last updated is less than last prune height + if !ok || (typeOk && versionUpdated < db.GetEarliestVersion()) { + itr.SeekGE(storePrefix(storeKey + "0")) + continue + } + } + + currVersionDecoded, err := decodeUint64Ascending(currVersion) + if err != nil { + return err + } + + // Seek to next key if we are at a version which is higher than prune height + // Do not seek to next key if KeepLastVersion is false and we need to delete the previous key in pruning + if currVersionDecoded > version && (db.config.KeepLastVersion || prevVersionDecoded > version) { + itr.NextPrefix() + continue + } + + // Delete a key if another entry for that key exists at a larger version than original but leq to the prune height + // Also delete a key if it has been tombstoned and its version is leq to the prune height + // Also delete a key if KeepLastVersion is false and version is leq to the prune height + if prevVersionDecoded <= version && (bytes.Equal(prevKey, currKey) || valTombstoned(prevValEncoded) || !db.config.KeepLastVersion) { + err = batch.Delete(prevKeyEncoded, nil) + if err != nil { + return err + } + + counter++ + if counter >= PruneCommitBatchSize { + err = batch.Commit(defaultWriteOpts) + if err != nil { + return err + } + + counter = 0 + batch.Reset() + } + } + + // Update prevKey and prevVersion for next iteration + prevKey = currKey + prevVersionDecoded = currVersionDecoded + prevKeyEncoded = currKeyEncoded + prevValEncoded = slices.Clone(itr.Value()) + + itr.Next() + } + + // Commit any leftover delete ops in batch + if counter > 0 { + err = batch.Commit(defaultWriteOpts) + if err != nil { + return err + } + } + + return db.SetEarliestVersion(earliestVersion, false) +} + +func (db *Database) 
iteratorAscending(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { + if (start != nil && len(start) == 0) || (end != nil && len(end) == 0) { + return nil, errorutils.ErrKeyEmpty + } + + if start != nil && end != nil && bytes.Compare(start, end) > 0 { + return nil, errorutils.ErrStartAfterEnd + } + + lowerBound := MVCCEncodeAscending(prependStoreKey(storeKey, start), 0) + + var upperBound []byte + if end != nil { + upperBound = MVCCEncodeAscending(prependStoreKey(storeKey, end), 0) + } + + itr, err := db.storage.NewIter(&pebble.IterOptions{LowerBound: lowerBound, UpperBound: upperBound}) + if err != nil { + return nil, fmt.Errorf("failed to create PebbleDB iterator: %w", err) + } + + return newAscendingIterator(itr, storePrefix(storeKey), start, end, version, db.GetEarliestVersion(), false, storeKey), nil +} + +func (db *Database) reverseIteratorAscending(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { + if (start != nil && len(start) == 0) || (end != nil && len(end) == 0) { + return nil, errorutils.ErrKeyEmpty + } + + if start != nil && end != nil && bytes.Compare(start, end) > 0 { + return nil, errorutils.ErrStartAfterEnd + } + + lowerBound := MVCCEncodeAscending(prependStoreKey(storeKey, start), 0) + + var upperBound []byte + if end != nil { + upperBound = MVCCEncodeAscending(prependStoreKey(storeKey, end), 0) + } else { + upperBound = MVCCEncodeAscending(prefixEnd(storePrefix(storeKey)), 0) + } + + itr, err := db.storage.NewIter(&pebble.IterOptions{LowerBound: lowerBound, UpperBound: upperBound}) + if err != nil { + return nil, fmt.Errorf("failed to create PebbleDB iterator: %w", err) + } + + return newAscendingIterator(itr, storePrefix(storeKey), start, end, version, db.GetEarliestVersion(), true, storeKey), nil +} + +func getMVCCSliceAscending(db *pebble.DB, storeKey string, key []byte, version int64) ([]byte, error) { + // end domain is exclusive, so we need to increment the version by 1 + if 
version < math.MaxInt64 { + version++ + } + + itr, err := db.NewIter(&pebble.IterOptions{ + LowerBound: MVCCEncodeAscending(prependStoreKey(storeKey, key), 0), + UpperBound: MVCCEncodeAscending(prependStoreKey(storeKey, key), version), + }) + if err != nil { + return nil, fmt.Errorf("failed to create PebbleDB iterator: %w", err) + } + defer func() { + err = errorutils.Join(err, itr.Close()) + }() + + if !itr.Last() { + return nil, errorutils.ErrRecordNotFound + } + + _, vBz, ok := SplitMVCCKey(itr.Key()) + if !ok { + return nil, fmt.Errorf("invalid PebbleDB MVCC key: %s", itr.Key()) + } + + keyVersion, err := decodeUint64Ascending(vBz) + if err != nil { + return nil, fmt.Errorf("failed to decode key version: %w", err) + } + if keyVersion > version { + return nil, fmt.Errorf("key version too large: %d", keyVersion) + } + + return slices.Clone(itr.Value()), nil +} diff --git a/sei-db/db_engine/pebbledb/mvcc/db_descending.go b/sei-db/db_engine/pebbledb/mvcc/db_descending.go new file mode 100644 index 0000000000..ad80bf2a2d --- /dev/null +++ b/sei-db/db_engine/pebbledb/mvcc/db_descending.go @@ -0,0 +1,323 @@ +package mvcc + +import ( + "bytes" + "context" + "errors" + "fmt" + "time" + + "github.com/cockroachdb/pebble/v2" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + "golang.org/x/exp/slices" + + errorutils "github.com/sei-protocol/sei-chain/sei-db/common/errors" + "github.com/sei-protocol/sei-chain/sei-db/db_engine/types" +) + +// This file contains the descending-version MVCC implementation used by DBs +// created by this build. It is the fast path: versions of a logical key sort +// newest-first on disk, so Pebble's First() / SeekGE lands directly on the +// latest visible version without iterating older ones. +// +// Callers go through the dispatchers in db.go; nothing here should be invoked +// directly by code outside the package. 
+
+func (db *Database) hasDescending(storeKey string, version int64, key []byte) (bool, error) {
+	if version < db.GetEarliestVersion() {
+		return false, nil
+	}
+
+	val, err := db.getDescending(storeKey, version, key)
+	if err != nil {
+		return false, err
+	}
+
+	return val != nil, nil
+}
+
+func (db *Database) getDescending(storeKey string, targetVersion int64, key []byte) (_ []byte, _err error) {
+	startTime := time.Now()
+	defer func() {
+		otelMetrics.getLatency.Record(
+			context.Background(),
+			time.Since(startTime).Seconds(),
+			metric.WithAttributes(
+				attribute.Bool("success", _err == nil),
+				attribute.String("store", storeKey),
+			),
+		)
+	}()
+	if targetVersion < db.GetEarliestVersion() {
+		return nil, nil
+	}
+
+	prefixedVal, err := getMVCCSliceDescending(db.storage, storeKey, key, targetVersion)
+	if err != nil {
+		if errors.Is(err, errorutils.ErrRecordNotFound) {
+			return nil, nil
+		}
+
+		return nil, fmt.Errorf("failed to perform PebbleDB read: %w", err)
+	}
+
+	return visibleValueAtVersionDescending(prefixedVal, targetVersion)
+}
+
+// pruneDescending attempts to prune all versions up to and including the current version
+// Get the range of keys, manually iterate over them and delete them
+// We add a heuristic to skip over a module's keys during pruning if it hasn't been updated
+// since the last time pruning occurred.
+// NOTE: There is a rare case when a module's keys are skipped during pruning even though
+// it has been updated. This occurs when that module's keys are updated in between pruning runs and the node is then restarted before the next prune.
+// This is not a large issue given the next time that module is updated, it will be properly pruned thereafter.
+func (db *Database) pruneDescending(version int64) (_err error) { + // Defensive check: ensure database is not closed + if db.storage == nil { + return errors.New("pebbledb: database is closed") + } + + startTime := time.Now() + defer func() { + otelMetrics.pruneLatency.Record( + context.Background(), + time.Since(startTime).Seconds(), + metric.WithAttributes( + attribute.Bool("success", _err == nil), + ), + ) + }() + + earliestVersion := version + 1 // we increment by 1 to include the provided version + + itr, err := db.storage.NewIter(nil) + if err != nil { + return err + } + defer func() { _ = itr.Close() }() + + batch := db.storage.NewBatch() + defer func() { _ = batch.Close() }() + + var ( + counter int + prevKey []byte + keptBelowPrune bool + prevStore string + ) + + for itr.First(); itr.Valid(); { + currKeyEncoded := slices.Clone(itr.Key()) + + // Ignore metadata entries during pruning + if isMetadataKey(currKeyEncoded) { + itr.Next() + continue + } + + // Store current key and version + currKey, currVersion, currOK := SplitMVCCKey(currKeyEncoded) + if !currOK { + return fmt.Errorf("invalid MVCC key") + } + + storeKey, err := parseStoreKey(currKey) + if err != nil { + // XXX: This should never happen given we skip the metadata keys. + return err + } + + // For every new module visited, check to see last time it was updated + if storeKey != prevStore { + prevStore = storeKey + updated, ok := db.storeKeyDirty.Load(storeKey) + versionUpdated, typeOk := updated.(int64) + // Skip a store's keys if version it was last updated is less than last prune height + if !ok || (typeOk && versionUpdated < db.GetEarliestVersion()) { + itr.SeekGE(storePrefix(storeKey + "0")) + continue + } + } + + currVersionDecoded, err := decodeUint64Descending(currVersion) + if err != nil { + return err + } + + // Reset per-logical-key state when the logical key changes. 
+ if !bytes.Equal(prevKey, currKey) { + prevKey = slices.Clone(currKey) + keptBelowPrune = false + + // Fast path: under descending encoding, versions of a key are stored + // newest-first. When the newest real version is above the prune + // height, seek directly to the first version <= prune height for + // this key instead of iterating through every above-prune version. + if currVersionDecoded > version { + itr.SeekGE(MVCCEncodeDescending(currKey, version)) + continue + } + } + + // Descending iteration: for a given logical key we see newest→oldest. + // Versions > prune height are always kept. For versions <= prune + // height, keep only the newest one when KeepLastVersion is true; + // delete every other such version. + if currVersionDecoded <= version { + if db.config.KeepLastVersion && !keptBelowPrune { + keptBelowPrune = true + } else { + if err := batch.Delete(currKeyEncoded, nil); err != nil { + return err + } + counter++ + if counter >= PruneCommitBatchSize { + if err := batch.Commit(defaultWriteOpts); err != nil { + return err + } + counter = 0 + batch.Reset() + } + } + } + + itr.Next() + } + + // Commit any leftover delete ops in batch + if counter > 0 { + err = batch.Commit(defaultWriteOpts) + if err != nil { + return err + } + } + + return db.SetEarliestVersion(earliestVersion, false) +} + +func (db *Database) iteratorDescending(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { + if (start != nil && len(start) == 0) || (end != nil && len(end) == 0) { + return nil, errorutils.ErrKeyEmpty + } + + if start != nil && end != nil && bytes.Compare(start, end) > 0 { + return nil, errorutils.ErrStartAfterEnd + } + + lowerBound := MVCCEncodeDescending(prependStoreKey(storeKey, start), 0) + + var upperBound []byte + if end != nil { + upperBound = MVCCEncodeDescending(prependStoreKey(storeKey, end), 0) + } else { + upperBound = iteratorUpperBoundForStoreDescending(storeKey) + } + + itr, err := 
db.storage.NewIter(&pebble.IterOptions{LowerBound: lowerBound, UpperBound: upperBound}) + if err != nil { + return nil, fmt.Errorf("failed to create PebbleDB iterator: %w", err) + } + + return newPebbleDBIterator(itr, storePrefix(storeKey), start, end, version, db.GetEarliestVersion(), false, storeKey), nil +} + +func (db *Database) reverseIteratorDescending(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { + if (start != nil && len(start) == 0) || (end != nil && len(end) == 0) { + return nil, errorutils.ErrKeyEmpty + } + + if start != nil && end != nil && bytes.Compare(start, end) > 0 { + return nil, errorutils.ErrStartAfterEnd + } + + lowerBound := MVCCEncodeDescending(prependStoreKey(storeKey, start), 0) + + var upperBound []byte + if end != nil { + upperBound = MVCCEncodeDescending(prependStoreKey(storeKey, end), 0) + } else { + upperBound = MVCCEncodeDescending(prefixEnd(storePrefix(storeKey)), 0) + } + + itr, err := db.storage.NewIter(&pebble.IterOptions{LowerBound: lowerBound, UpperBound: upperBound}) + if err != nil { + return nil, fmt.Errorf("failed to create PebbleDB iterator: %w", err) + } + + return newPebbleDBIterator(itr, storePrefix(storeKey), start, end, version, db.GetEarliestVersion(), true, storeKey), nil +} + +func getMVCCSliceDescending(db *pebble.DB, storeKey string, key []byte, version int64) (_ []byte, err error) { + prefixedKey := prependStoreKey(storeKey, key) + itr, err := db.NewIter(&pebble.IterOptions{ + LowerBound: MVCCEncodeDescending(prefixedKey, version), + UpperBound: iteratorUpperBoundForLogicalKeyDescending(prefixedKey), + }) + if err != nil { + return nil, fmt.Errorf("failed to create PebbleDB iterator: %w", err) + } + defer func() { + err = errorutils.Join(err, itr.Close()) + }() + + if !itr.First() { + return nil, errorutils.ErrRecordNotFound + } + return decodeMVCCEntryDescending(itr.Key(), itr.Value(), prefixedKey, version) +} + +// decodeMVCCEntryDescending validates that the iterator's current 
entry +// belongs to prefixedKey at a version <= target and returns a safe copy of the +// value. Assumes descending version encoding. +func decodeMVCCEntryDescending(rawIterKey, rawIterValue, prefixedKey []byte, version int64) ([]byte, error) { + userKey, vBz, ok := SplitMVCCKey(rawIterKey) + if !ok { + return nil, fmt.Errorf("invalid PebbleDB MVCC key: %s", rawIterKey) + } + if !bytes.Equal(userKey, prefixedKey) { + return nil, errorutils.ErrRecordNotFound + } + keyVersion, err := decodeUint64Descending(vBz) + if err != nil { + return nil, fmt.Errorf("failed to decode key version: %w", err) + } + if keyVersion > version { + return nil, errorutils.ErrRecordNotFound + } + return slices.Clone(rawIterValue), nil +} + +func visibleValueAtVersionDescending(prefixedVal []byte, targetVersion int64) ([]byte, error) { + valBz, tombBz, ok := SplitMVCCKey(prefixedVal) + if !ok { + return nil, fmt.Errorf("invalid PebbleDB MVCC value: %s", prefixedVal) + } + if len(tombBz) == 0 { + return valBz, nil + } + tombstone, err := decodeUint64Descending(tombBz) + if err != nil { + return nil, fmt.Errorf("failed to decode value tombstone: %w", err) + } + if targetVersion < tombstone { + return valBz, nil + } + return nil, nil +} + +func iteratorUpperBoundForStoreDescending(storeKey string) []byte { + upperStorePrefix := prefixEnd(storePrefix(storeKey)) + if upperStorePrefix == nil { + return nil + } + return MVCCEncodeDescending(upperStorePrefix, 0) +} + +func iteratorUpperBoundForLogicalKeyDescending(key []byte) []byte { + upperKeyPrefix := prefixEnd(key) + if upperKeyPrefix == nil { + return nil + } + return MVCCEncodeDescending(upperKeyPrefix, 0) +} diff --git a/sei-db/db_engine/pebbledb/mvcc/descending_marker_test.go b/sei-db/db_engine/pebbledb/mvcc/descending_marker_test.go index 24a3b9290c..b1d2f07910 100644 --- a/sei-db/db_engine/pebbledb/mvcc/descending_marker_test.go +++ b/sei-db/db_engine/pebbledb/mvcc/descending_marker_test.go @@ -1,17 +1,18 @@ package mvcc import ( - 
"encoding/binary" "testing" "github.com/cockroachdb/pebble/v2" "github.com/stretchr/testify/require" "github.com/sei-protocol/sei-chain/sei-db/config" + "github.com/sei-protocol/sei-chain/sei-db/proto" ) // TestDescendingMVCCMarker_FreshDBWritesMarker verifies that opening an empty -// pebbledb writes the descending-MVCC sentinel so subsequent opens fast-path. +// pebbledb writes the descending-MVCC sentinel so subsequent opens fast-path, +// and that the DB opens in descending mode. func TestDescendingMVCCMarker_FreshDBWritesMarker(t *testing.T) { dir := t.TempDir() cfg := config.DefaultStateStoreConfig() @@ -19,6 +20,8 @@ func TestDescendingMVCCMarker_FreshDBWritesMarker(t *testing.T) { store, err := OpenDB(dir, cfg) require.NoError(t, err) + db := store.(*Database) + require.True(t, db.descending, "fresh DB must open in descending mode") require.NoError(t, store.Close()) // Reopen the raw pebble DB and check the sentinel is there. @@ -32,29 +35,69 @@ func TestDescendingMVCCMarker_FreshDBWritesMarker(t *testing.T) { require.NoError(t, closer.Close()) } -// TestDescendingMVCCMarker_LegacyDBRejected simulates a DB written by the old -// ascending-version build (latestVersionKey present, no marker) and asserts we -// refuse to open it rather than silently returning wrong versions. -func TestDescendingMVCCMarker_LegacyDBRejected(t *testing.T) { +// TestDescendingMVCCMarker_LegacyDBOpensInAscendingMode simulates a DB +// written by the legacy ascending-version build and asserts OpenDB does NOT +// error, instead returning a Database that operates in ascending mode. It +// then performs a write + read round-trip to confirm correctness, and +// verifies no descending marker was written to the legacy DB. 
+func TestDescendingMVCCMarker_LegacyDBOpensInAscendingMode(t *testing.T) { dir := t.TempDir() - raw, err := pebble.Open(dir, &pebble.Options{Comparer: MVCCComparer}) - require.NoError(t, err) - var ts [VersionSize]byte - binary.LittleEndian.PutUint64(ts[:], uint64(42)) - require.NoError(t, raw.Set([]byte(latestVersionKey), ts[:], pebble.Sync)) - require.NoError(t, raw.Close()) + // Seed the directory with a legacy-style DB: some ascending-encoded data + // plus a latestVersionKey, but no descending marker. + { + raw, err := pebble.Open(dir, &pebble.Options{Comparer: MVCCComparer}) + require.NoError(t, err) + // Write a value for ("store1", "k") at version 1 using the ascending + // encoding, matching what a legacy build would have persisted. + prefixedKey := MVCCEncodeAscending(prependStoreKey("store1", []byte("k")), 1) + prefixedVal := MVCCEncodeAscending([]byte("v1"), 0) + require.NoError(t, raw.Set(prefixedKey, prefixedVal, pebble.Sync)) + // Set latestVersionKey so detectMVCCMode sees a populated DB. + var ts [VersionSize]byte + ts[0] = 1 + require.NoError(t, raw.Set([]byte(latestVersionKey), ts[:], pebble.Sync)) + require.NoError(t, raw.Close()) + } cfg := config.DefaultStateStoreConfig() cfg.Backend = "pebbledb" - _, err = OpenDB(dir, cfg) - require.Error(t, err) - require.Contains(t, err.Error(), "state sync required") + store, err := OpenDB(dir, cfg) + require.NoError(t, err, "legacy DB must open without error") + db := store.(*Database) + require.False(t, db.descending, "legacy DB must open in ascending mode") + + // Pre-existing value is readable via the ascending path. + got, err := db.Get("store1", 1, []byte("k")) + require.NoError(t, err) + require.Equal(t, []byte("v1"), got) + + // Writes land in ascending encoding and round-trip correctly. 
+ require.NoError(t, db.ApplyChangesetSync(2, []*proto.NamedChangeSet{{ + Name: "store1", + Changeset: proto.ChangeSet{Pairs: []*proto.KVPair{{Key: []byte("k"), Value: []byte("v2")}}}, + }})) + got, err = db.Get("store1", 2, []byte("k")) + require.NoError(t, err) + require.Equal(t, []byte("v2"), got) + + require.NoError(t, store.Close()) + + // Confirm we did NOT stamp the descending marker on a legacy DB. + raw, err := pebble.Open(dir, &pebble.Options{Comparer: MVCCComparer}) + require.NoError(t, err) + defer func() { _ = raw.Close() }() + + _, closer, err := raw.Get([]byte(descendingMVCCMarkerKey)) + require.ErrorIs(t, err, pebble.ErrNotFound, "legacy DB must stay unmarked") + if closer != nil { + _ = closer.Close() + } } // TestDescendingMVCCMarker_RoundTrip writes data with OpenDB, reopens, and -// confirms the second open succeeds (marker is honored, no false rejection). +// confirms the second open succeeds in descending mode (marker is honored). func TestDescendingMVCCMarker_RoundTrip(t *testing.T) { dir := t.TempDir() cfg := config.DefaultStateStoreConfig() @@ -63,10 +106,13 @@ func TestDescendingMVCCMarker_RoundTrip(t *testing.T) { store, err := OpenDB(dir, cfg) require.NoError(t, err) db := store.(*Database) + require.True(t, db.descending) applyVersion(t, db, "store1", 1, []byte("k"), []byte("v")) require.NoError(t, db.Close()) store2, err := OpenDB(dir, cfg) require.NoError(t, err) + db2 := store2.(*Database) + require.True(t, db2.descending, "marked DB must reopen in descending mode") require.NoError(t, store2.Close()) } diff --git a/sei-db/db_engine/pebbledb/mvcc/iterator_ascending.go b/sei-db/db_engine/pebbledb/mvcc/iterator_ascending.go new file mode 100644 index 0000000000..49bd1dcfa6 --- /dev/null +++ b/sei-db/db_engine/pebbledb/mvcc/iterator_ascending.go @@ -0,0 +1,398 @@ +package mvcc + +import ( + "bytes" + "context" + "fmt" + "sync" + + "github.com/cockroachdb/pebble/v2" + "go.opentelemetry.io/otel/attribute" + 
"go.opentelemetry.io/otel/metric" + "golang.org/x/exp/slices" + + "github.com/sei-protocol/sei-chain/sei-db/db_engine/types" +) + +// This file contains the ascending-version MVCC iterator used for legacy DBs +// that were written by the pre-descending build. It is a verbatim port of the +// iterator implementation from main and is intentionally kept isolated from +// the descending fast-path iterator to avoid subtle interactions between the +// two encoding schemes. +// +// Archive nodes that cannot migrate will continue to use this path. + +var _ types.DBIterator = (*ascendingIterator)(nil) + +// ascendingIterator is the legacy iterator. Versions of a logical key sort +// oldest-first on disk, so finding the visible version for a target height +// requires a SeekLT(version+1) dance rather than a cheap First(). +type ascendingIterator struct { + source *pebble.Iterator + prefix, start, end []byte + version int64 + valid bool + reverse bool + iterationCount int64 + storeKey string + + closeSync sync.Once +} + +func newAscendingIterator(src *pebble.Iterator, prefix, mvccStart, mvccEnd []byte, version int64, earliestVersion int64, reverse bool, storeKey string) *ascendingIterator { + // Return invalid iterator if requested iterator height is lower than earliest version after pruning + if version < earliestVersion { + return &ascendingIterator{ + source: src, + prefix: prefix, + start: mvccStart, + end: mvccEnd, + version: version, + valid: false, + reverse: reverse, + storeKey: storeKey, + } + } + + // move the underlying PebbleDB iterator to the first key + var valid bool + if reverse { + valid = src.Last() + } else { + valid = src.First() + } + + itr := &ascendingIterator{ + source: src, + prefix: prefix, + start: mvccStart, + end: mvccEnd, + version: version, + valid: valid, + reverse: reverse, + storeKey: storeKey, + } + + if valid { + currKey, currKeyVersion, ok := SplitMVCCKey(itr.source.Key()) + if !ok { + // XXX: This should not happen as that would indicate 
we have a malformed MVCC key. + panic(fmt.Sprintf("invalid PebbleDB MVCC key: %s", itr.source.Key())) + } + + curKeyVersionDecoded, err := decodeUint64Ascending(currKeyVersion) + if err != nil { + itr.valid = false + return itr + } + + // We need to check whether initial key iterator visits has a version <= requested version + // If larger version, call next to find another key which does + if curKeyVersionDecoded > itr.version { + itr.Next() + } else { + // If version is less, seek to the largest version of that key <= requested iterator version + // It is guaranteed this won't move the iterator to a key that is invalid since + // curKeyVersionDecoded <= requested iterator version, so there exists at least one version of currKey SeekLT may move to + itr.valid = itr.source.SeekLT(MVCCEncodeAscending(currKey, itr.version+1)) + } + } + + // Make sure we skip to the next key if the current one is tombstone + // Only check if iterator is still valid after the seek/next operations above + if itr.valid && valTombstoned(itr.source.Value()) { + if reverse { + itr.nextReverse() + } else { + itr.nextForward() + } + } + + return itr +} + +// Domain returns the domain of the iterator. The caller must not modify the +// return values. +func (itr *ascendingIterator) Domain() ([]byte, []byte) { + return itr.start, itr.end +} + +func (itr *ascendingIterator) Key() []byte { + itr.assertIsValid() + + key, _, ok := SplitMVCCKey(itr.source.Key()) + if !ok { + // XXX: This should not happen as that would indicate we have a malformed + // MVCC key. + panic(fmt.Sprintf("invalid PebbleDB MVCC key: %s", itr.source.Key())) + } + + keyCopy := slices.Clone(key) + return keyCopy[len(itr.prefix):] +} + +func (itr *ascendingIterator) Value() []byte { + itr.assertIsValid() + + val, _, ok := SplitMVCCKey(itr.source.Value()) + if !ok { + // XXX: This should not happen as that would indicate we have a malformed + // MVCC value. 
+ panic(fmt.Sprintf("invalid PebbleDB MVCC value: %s", itr.source.Key())) + } + + return slices.Clone(val) +} + +func (itr *ascendingIterator) nextForward() { + if !itr.source.Valid() { + itr.valid = false + return + } + + currKey, _, ok := SplitMVCCKey(itr.source.Key()) + if !ok { + // XXX: This should not happen as that would indicate we have a malformed + // MVCC key. + panic(fmt.Sprintf("invalid PebbleDB MVCC key: %s", itr.source.Key())) + } + + next := itr.source.NextPrefix() + + // First move the iterator to the next prefix, which may not correspond to the + // desired version for that key, e.g. if the key was written at a later version, + // so we seek back to the latest desired version, s.t. the version is <= itr.version. + if next { + nextKey, _, ok := SplitMVCCKey(itr.source.Key()) + if !ok { + // XXX: This should not happen as that would indicate we have a malformed + // MVCC key. + itr.valid = false + return + } + if !bytes.HasPrefix(nextKey, itr.prefix) { + // the next key must have itr.prefix as the prefix + itr.valid = false + return + } + + // Move the iterator to the closest version to the desired version, so we + // append the current iterator key to the prefix and seek to that key. + itr.valid = itr.source.SeekLT(MVCCEncodeAscending(nextKey, itr.version+1)) + + tmpKey, tmpKeyVersion, ok := SplitMVCCKey(itr.source.Key()) + if !ok { + // XXX: This should not happen as that would indicate we have a malformed + // MVCC key. + itr.valid = false + return + } + + // There exists cases where the SeekLT() call moved us back to the same key + // we started at, so we must move to next key, i.e. two keys forward. + if bytes.Equal(tmpKey, currKey) { + if itr.source.NextPrefix() { + itr.nextForward() + + _, tmpKeyVersion, ok = SplitMVCCKey(itr.source.Key()) + if !ok { + // XXX: This should not happen as that would indicate we have a malformed + // MVCC key. 
+ itr.valid = false + return + } + + } else { + itr.valid = false + return + } + } + + // We need to verify that every Next call either moves the iterator to a key whose version + // is less than or equal to requested iterator version, or exhausts the iterator + tmpKeyVersionDecoded, err := decodeUint64Ascending(tmpKeyVersion) + if err != nil { + itr.valid = false + return + } + + // If iterator is at a entry whose version is higher than requested version, call nextForward again + if tmpKeyVersionDecoded > itr.version { + itr.nextForward() + } + + // The cursor might now be pointing at a key/value pair that is tombstoned. + // If so, we must move the cursor. + if itr.valid && itr.cursorTombstoned() { + itr.nextForward() + } + + return + } + + itr.valid = false +} + +func (itr *ascendingIterator) nextReverse() { + if !itr.source.Valid() { + itr.valid = false + return + } + + currKey, _, ok := SplitMVCCKey(itr.source.Key()) + if !ok { + // XXX: This should not happen as that would indicate we have a malformed + // MVCC key. + panic(fmt.Sprintf("invalid PebbleDB MVCC key: %s", itr.source.Key())) + } + + next := itr.source.SeekLT(MVCCEncodeAscending(currKey, 0)) + + // First move the iterator to the next prefix, which may not correspond to the + // desired version for that key, e.g. if the key was written at a later version, + // so we seek back to the latest desired version, s.t. the version is <= itr.version. + if next { + nextKey, _, ok := SplitMVCCKey(itr.source.Key()) + if !ok { + // XXX: This should not happen as that would indicate we have a malformed + // MVCC key. + itr.valid = false + return + } + if !bytes.HasPrefix(nextKey, itr.prefix) { + // the next key must have itr.prefix as the prefix + itr.valid = false + return + } + + // Move the iterator to the closest version to the desired version, so we + // append the current iterator key to the prefix and seek to that key. 
+ itr.valid = itr.source.SeekLT(MVCCEncodeAscending(nextKey, itr.version+1)) + + _, tmpKeyVersion, ok := SplitMVCCKey(itr.source.Key()) + if !ok { + // XXX: This should not happen as that would indicate we have a malformed + // MVCC key. + itr.valid = false + return + } + + // We need to verify that every Next call either moves the iterator to a key whose version + // is less than or equal to requested iterator version, or exhausts the iterator + tmpKeyVersionDecoded, err := decodeUint64Ascending(tmpKeyVersion) + if err != nil { + itr.valid = false + return + } + + // If iterator is at a entry whose version is higher than requested version, call nextReverse again + if tmpKeyVersionDecoded > itr.version { + itr.nextReverse() + } + + // The cursor might now be pointing at a key/value pair that is tombstoned. + // If so, we must move the cursor. + if itr.valid && itr.cursorTombstoned() { + itr.nextReverse() + } + + return + } + + itr.valid = false +} + +func (itr *ascendingIterator) Next() { + itr.iterationCount++ + + if itr.reverse { + itr.nextReverse() + } else { + itr.nextForward() + } +} + +func (itr *ascendingIterator) Valid() bool { + // once invalid, forever invalid + if !itr.valid || !itr.source.Valid() { + itr.valid = false + return itr.valid + } + + // if source has error, consider it invalid + if err := itr.source.Error(); err != nil { + itr.valid = false + return itr.valid + } + + // if key is at the end or past it, consider it invalid + if end := itr.end; end != nil { + if bytes.Compare(end, itr.Key()) <= 0 { + itr.valid = false + return itr.valid + } + } + + return true +} + +func (itr *ascendingIterator) Error() error { + return itr.source.Error() +} + +func (itr *ascendingIterator) Close() error { + itr.closeSync.Do(func() { + _ = itr.source.Close() + itr.source = nil + itr.valid = false + + // Record the number of iterations performed by this iterator + otelMetrics.iteratorIterations.Record( + context.Background(), + float64(itr.iterationCount), + 
metric.WithAttributes(
+			attribute.Bool("reverse", itr.reverse),
+			attribute.String("store", itr.storeKey),
+		),
+		)
+	})
+	return nil
+}
+
+func (itr *ascendingIterator) assertIsValid() {
+	if !itr.valid {
+		panic("iterator is invalid")
+	}
+}
+
+// cursorTombstoned checks if the current cursor is pointing at a key/value pair
+// that is tombstoned. If the cursor is tombstoned, true is returned, otherwise
+// false is returned. In the case where the iterator is valid but the key/value
+// pair is tombstoned, the caller should call Next(). Note, this method assumes
+// the caller assures the iterator is valid first!
+func (itr *ascendingIterator) cursorTombstoned() bool {
+	_, tombBz, ok := SplitMVCCKey(itr.source.Value())
+	if !ok {
+		// XXX: This should not happen as that would indicate we have a malformed
+		// MVCC value.
+		panic(fmt.Sprintf("invalid PebbleDB MVCC value: %s", itr.source.Key()))
+	}
+
+	// If the tombstone suffix is empty, we consider this a zero value and thus it
+	// is not tombstoned.
+	if len(tombBz) == 0 {
+		return false
+	}
+
+	// If the tombstone suffix is non-empty and greater than the target version,
+	// the value is not tombstoned.
+ tombstone, err := decodeUint64Ascending(tombBz) + if err != nil { + panic(fmt.Errorf("failed to decode value tombstone: %w", err)) + } + if tombstone > itr.version { + return false + } + + return true +} diff --git a/sei-db/db_engine/pebbledb/mvcc/iterator.go b/sei-db/db_engine/pebbledb/mvcc/iterator_descending.go similarity index 98% rename from sei-db/db_engine/pebbledb/mvcc/iterator.go rename to sei-db/db_engine/pebbledb/mvcc/iterator_descending.go index 60d4d47f25..cd75f63fe9 100644 --- a/sei-db/db_engine/pebbledb/mvcc/iterator.go +++ b/sei-db/db_engine/pebbledb/mvcc/iterator_descending.go @@ -96,7 +96,7 @@ func newPebbleDBIterator(src *pebble.Iterator, prefix, mvccStart, mvccEnd []byte } func (itr *iterator) seekVisibleVersionForKey(targetKey []byte) bool { - seekKey := MVCCEncode(targetKey, itr.version) + seekKey := MVCCEncodeDescending(targetKey, itr.version) valid := itr.source.SeekGE(seekKey) if !valid { return false @@ -121,7 +121,7 @@ func (itr *iterator) nextLogicalKey(currKey []byte) ([]byte, bool) { if nextKeyPrefix == nil { return nil, false } - seekKey := MVCCEncode(nextKeyPrefix, math.MaxInt64) + seekKey := MVCCEncodeDescending(nextKeyPrefix, math.MaxInt64) valid := itr.source.SeekGE(seekKey) if !valid { return nil, false @@ -134,7 +134,7 @@ func (itr *iterator) nextLogicalKey(currKey []byte) ([]byte, bool) { } func (itr *iterator) prevLogicalKey(currKey []byte) ([]byte, bool) { - seekKey := MVCCEncode(currKey, math.MaxInt64) + seekKey := MVCCEncodeDescending(currKey, math.MaxInt64) valid := itr.source.SeekLT(seekKey) if !valid { return nil, false diff --git a/sei-db/db_engine/pebbledb/mvcc/prune_test.go b/sei-db/db_engine/pebbledb/mvcc/prune_test.go index 909a2666be..6334760534 100644 --- a/sei-db/db_engine/pebbledb/mvcc/prune_test.go +++ b/sei-db/db_engine/pebbledb/mvcc/prune_test.go @@ -16,8 +16,8 @@ import ( func rawVersionsForKey(t *testing.T, db *Database, store string, key []byte) []int64 { t.Helper() prefix := prependStoreKey(store, key) 
- lower := MVCCEncode(prefix, 0) - upper := MVCCEncode(append(append([]byte{}, prefix...), 0x01), 0) + lower := MVCCEncodeDescending(prefix, 0) + upper := MVCCEncodeDescending(append(append([]byte{}, prefix...), 0x01), 0) itr, err := db.storage.NewIter(&pebble.IterOptions{LowerBound: lower, UpperBound: upper}) require.NoError(t, err) defer func() { _ = itr.Close() }() From 8f080086de2bc6f4ca5f8179e5c6516c259148db Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Fri, 17 Apr 2026 13:17:57 -0400 Subject: [PATCH 11/15] chore: inline trivial batch helpers and snapshotReplayState Drop three ornamental wrappers flagged by review: - Batch.appendSet / Batch.appendDelete / RawBatch.appendSet were each five-line clone-and-append helpers called 1-3 times; inline at the call sites. - NewBatch / NewRawBatch were zero-argument wrappers that always forwarded with descending=true to NewBatchWithMode. Drop the wrappers, rename NewBatchWithMode -> NewBatch and NewRawBatchWithMode -> NewRawBatch so the mode is required at the one kind of call site that exists. - snapshotReplayState was a one-liner ctx.WithMultiStore(ctx.MultiStore() .CacheMultiStore()) wrapper with two call sites. Inline it. No behavior change. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- evmrpc/simulate.go | 8 +--- sei-db/db_engine/pebbledb/mvcc/batch.go | 56 +++++++------------------ sei-db/db_engine/pebbledb/mvcc/db.go | 10 ++--- 3 files changed, 23 insertions(+), 51 deletions(-) diff --git a/evmrpc/simulate.go b/evmrpc/simulate.go index 2b09c1bf57..0143efc978 100644 --- a/evmrpc/simulate.go +++ b/evmrpc/simulate.go @@ -518,7 +518,7 @@ func (b *Backend) ReplayTransactionTillIndex(ctx context.Context, block *ethtype sdkCtx = sdkCtx.WithMultiStore(cachedCtx.MultiStore().CacheMultiStore()) startIdx = cachedIdx + 1 } else { - b.putReplayState(block.Hash().Hex(), -1, snapshotReplayState(sdkCtx.WithTraceMode(true))) + b.putReplayState(block.Hash().Hex(), -1, sdkCtx.WithTraceMode(true).WithMultiStore(sdkCtx.MultiStore().CacheMultiStore())) } for idx, tx := range tmBlock.Block.Txs { @@ -538,7 +538,7 @@ func (b *Backend) ReplayTransactionTillIndex(ctx context.Context, block *ethtype _ = b.app.DeliverTx(sdkCtx, abci.RequestDeliverTxV2{Tx: tx}, sdkTx, sha256.Sum256(tx)) } finalCtx := sdkCtx.WithIsEVM(true) - b.putReplayState(block.Hash().Hex(), txIndex, snapshotReplayState(finalCtx.WithTraceMode(true))) + b.putReplayState(block.Hash().Hex(), txIndex, finalCtx.WithTraceMode(true).WithMultiStore(finalCtx.MultiStore().CacheMultiStore())) return state.NewDBImpl(finalCtx, b.keeper, true), tmBlock.Block.Txs, nil } @@ -579,10 +579,6 @@ func (b *Backend) putReplayState(blockHash string, txIndex int, ctx sdk.Context) state.mu.Unlock() } -func snapshotReplayState(ctx sdk.Context) sdk.Context { - return ctx.WithMultiStore(ctx.MultiStore().CacheMultiStore()) -} - func (b *Backend) StateAtBlock(ctx context.Context, block *ethtypes.Block, reexec uint64, base vm.StateDB, readOnly bool, preferDisk bool) (vm.StateDB, tracers.StateReleaseFunc, error) { emptyRelease := func() {} sdkCtx, _, err := b.initializeBlock(ctx, block) diff --git a/sei-db/db_engine/pebbledb/mvcc/batch.go b/sei-db/db_engine/pebbledb/mvcc/batch.go index 
261c913453..bcb5e3f5d4 100644 --- a/sei-db/db_engine/pebbledb/mvcc/batch.go +++ b/sei-db/db_engine/pebbledb/mvcc/batch.go @@ -26,14 +26,8 @@ type batchOp struct { delete bool } -// NewBatch creates a new descending-mode Batch. Callers that need ascending-mode -// encoding for legacy DBs should use NewBatchWithMode. -func NewBatch(storage *pebble.DB, version int64) (*Batch, error) { - return NewBatchWithMode(storage, version, true) -} - -// NewBatchWithMode creates a new Batch using the supplied MVCC encoding mode. -func NewBatchWithMode(storage *pebble.DB, version int64, descending bool) (*Batch, error) { +// NewBatch creates a new Batch using the supplied MVCC encoding mode. +func NewBatch(storage *pebble.DB, version int64, descending bool) (*Batch, error) { if version < 0 { return nil, fmt.Errorf("version must be non-negative") } @@ -58,7 +52,10 @@ func (b *Batch) set(storeKey string, tombstone int64, key, value []byte) error { prefixedKey := MVCCEncode(prependStoreKey(storeKey, key), b.version, b.descending) prefixedVal := MVCCEncode(value, tombstone, b.descending) - b.appendSet(prefixedKey, prefixedVal) + b.ops = append(b.ops, batchOp{ + key: append([]byte(nil), prefixedKey...), + value: append([]byte(nil), prefixedVal...), + }) return nil } @@ -118,14 +115,8 @@ type RawBatch struct { descending bool } -// NewRawBatch creates a new descending-mode RawBatch. -func NewRawBatch(storage *pebble.DB) (*RawBatch, error) { - return NewRawBatchWithMode(storage, true) -} - -// NewRawBatchWithMode creates a new RawBatch using the supplied MVCC encoding -// mode. -func NewRawBatchWithMode(storage *pebble.DB, descending bool) (*RawBatch, error) { +// NewRawBatch creates a new RawBatch using the supplied MVCC encoding mode. 
+func NewRawBatch(storage *pebble.DB, descending bool) (*RawBatch, error) { return &RawBatch{ storage: storage, ops: make([]batchOp, 0, 16), @@ -145,7 +136,10 @@ func (b *RawBatch) set(storeKey string, tombstone int64, key, value []byte, vers prefixedKey := MVCCEncode(prependStoreKey(storeKey, key), version, b.descending) prefixedVal := MVCCEncode(value, tombstone, b.descending) - b.appendSet(prefixedKey, prefixedVal) + b.ops = append(b.ops, batchOp{ + key: append([]byte(nil), prefixedKey...), + value: append([]byte(nil), prefixedVal...), + }) return nil } @@ -161,7 +155,10 @@ func (b *RawBatch) Delete(storeKey string, key []byte, version int64) error { // and calling the underlying pebble.Batch.Delete. func (b *Batch) HardDelete(storeKey string, key []byte) error { fullKey := MVCCEncode(prependStoreKey(storeKey, key), b.version, b.descending) - b.appendDelete(fullKey) + b.ops = append(b.ops, batchOp{ + key: append([]byte(nil), fullKey...), + delete: true, + }) return nil } @@ -200,27 +197,6 @@ func (b *RawBatch) Write() (err error) { return batch.Commit(defaultWriteOpts) } -func (b *Batch) appendSet(key, value []byte) { - b.ops = append(b.ops, batchOp{ - key: append([]byte(nil), key...), - value: append([]byte(nil), value...), - }) -} - -func (b *Batch) appendDelete(key []byte) { - b.ops = append(b.ops, batchOp{ - key: append([]byte(nil), key...), - delete: true, - }) -} - -func (b *RawBatch) appendSet(key, value []byte) { - b.ops = append(b.ops, batchOp{ - key: append([]byte(nil), key...), - value: append([]byte(nil), value...), - }) -} - func sortBatchOps(ops []batchOp) { sort.SliceStable(ops, func(i, j int) bool { return MVCCComparer.Compare(ops[i].key, ops[j].key) < 0 diff --git a/sei-db/db_engine/pebbledb/mvcc/db.go b/sei-db/db_engine/pebbledb/mvcc/db.go index 04882ea33e..9e890ddb6a 100644 --- a/sei-db/db_engine/pebbledb/mvcc/db.go +++ b/sei-db/db_engine/pebbledb/mvcc/db.go @@ -404,7 +404,7 @@ func (db *Database) ApplyChangesetSync(version int64, changeset 
[]*proto.NamedCh } // Create batch and persist latest version in the batch - b, err := NewBatchWithMode(db.storage, version, db.descending) + b, err := NewBatch(db.storage, version, db.descending) if err != nil { return err } @@ -547,7 +547,7 @@ func (db *Database) Import(version int64, ch <-chan types.SnapshotNode) (_err er worker := func() { defer wg.Done() - batch, err := NewBatchWithMode(db.storage, version, db.descending) + batch, err := NewBatch(db.storage, version, db.descending) if err != nil { panic(err) } @@ -568,7 +568,7 @@ func (db *Database) Import(version int64, ch <-chan types.SnapshotNode) (_err er panic(err) } - batch, err = NewBatchWithMode(db.storage, version, db.descending) + batch, err = NewBatch(db.storage, version, db.descending) if err != nil { panic(err) } @@ -653,7 +653,7 @@ func (db *Database) RawIterate(storeKey string, fn func(key []byte, value []byte func (db *Database) DeleteKeysAtVersion(module string, version int64) error { - batch, err := NewBatchWithMode(db.storage, version, db.descending) + batch, err := NewBatch(db.storage, version, db.descending) if err != nil { return fmt.Errorf("failed to create deletion batch for module %q: %w", module, err) } @@ -673,7 +673,7 @@ func (db *Database) DeleteKeysAtVersion(module string, version int64) error { return true } deleteCounter = 0 - batch, err = NewBatchWithMode(db.storage, version, db.descending) + batch, err = NewBatch(db.storage, version, db.descending) if err != nil { fmt.Printf("Error creating a new deletion batch for module %q: %v\n", module, err) return true From eb1baf54b39a433489d92cb30ff83280aa57333d Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Fri, 17 Apr 2026 13:47:32 -0400 Subject: [PATCH 12/15] chore: remove cross-call replay-state cache from Backend The block-level replay-state LRU only paid off when trace requests clustered on the same block in quick succession. 
evmrpc handles one call at a time and can't observe whether callers are clustering; workload-aware prefix reuse belongs above this layer (in a block explorer, indexer, or tracing harness that actually knows its access pattern). Baking it into the RPC added stateful fields on Backend, an LRU size / TTL we picked without data, and a double-lock put path that existed only to serve a workload the RPC can't verify. Drop the cache entirely: - blockReplayState, replayStateCache, replayStateCacheMu, replayStateCacheBlocks, replayStateCacheTTL from evmrpc/simulate.go - getReplayState, putReplayState methods - the cache check + two putReplayState calls in ReplayTransactionTillIndex; unconditionally replay from tx 0 - evmrpc/simulate_cache_test.go (no longer meaningful) - the golang-lru/v2/expirable import (still used elsewhere in evmrpc) Per-call perf is unaffected: the descending-version MVCC encoding still makes each replayed read cheap, which is the right layer for a per-call optimization. Co-Authored-By: Claude Opus 4.7 (1M context) --- evmrpc/simulate.go | 74 +--------------------------- evmrpc/simulate_cache_test.go | 93 ----------------------------------- 2 files changed, 1 insertion(+), 166 deletions(-) delete mode 100644 evmrpc/simulate_cache_test.go diff --git a/evmrpc/simulate.go b/evmrpc/simulate.go index 0143efc978..4f0267ae0f 100644 --- a/evmrpc/simulate.go +++ b/evmrpc/simulate.go @@ -28,7 +28,6 @@ import ( "github.com/ethereum/go-ethereum/export" "github.com/ethereum/go-ethereum/params" "github.com/ethereum/go-ethereum/rpc" - "github.com/hashicorp/golang-lru/v2/expirable" "github.com/sei-protocol/sei-chain/app/legacyabci" "github.com/sei-protocol/sei-chain/precompiles/wasmd" "github.com/sei-protocol/sei-chain/sei-cosmos/baseapp" @@ -231,23 +230,8 @@ type Backend struct { globalBlockCache BlockCache cacheCreationMutex *sync.Mutex watermarks *WatermarkManager - replayStateCacheMu *sync.Mutex - replayStateCache *expirable.LRU[string, *blockReplayState] } -// 
blockReplayState holds cached replay checkpoints for a single block, keyed -// by tx index. Protected by its own mutex so entries for different blocks -// can be updated independently. -type blockReplayState struct { - mu sync.Mutex - checkpoints map[int]sdk.Context -} - -const ( - replayStateCacheBlocks = 32 - replayStateCacheTTL = 10 * time.Minute -) - func NewBackend( ctxProvider func(int64) sdk.Context, keeper *keeper.Keeper, @@ -273,10 +257,6 @@ func NewBackend( globalBlockCache: globalBlockCache, cacheCreationMutex: cacheCreationMutex, watermarks: watermarks, - replayStateCacheMu: &sync.Mutex{}, - replayStateCache: expirable.NewLRU[string, *blockReplayState]( - replayStateCacheBlocks, nil, replayStateCacheTTL, - ), } } @@ -511,20 +491,7 @@ func (b *Backend) ReplayTransactionTillIndex(ctx context.Context, block *ethtype return state.NewDBImpl(sdkCtx.WithIsEVM(true), b.keeper, true), tmBlock.Block.Txs, nil } - startIdx := 0 - if cachedCtx, cachedIdx, ok := b.getReplayState(block.Hash().Hex(), txIndex); ok { - // Always replay from a fresh branch of the cached checkpoint so the - // stored snapshot remains immutable across requests. 
- sdkCtx = sdkCtx.WithMultiStore(cachedCtx.MultiStore().CacheMultiStore()) - startIdx = cachedIdx + 1 - } else { - b.putReplayState(block.Hash().Hex(), -1, sdkCtx.WithTraceMode(true).WithMultiStore(sdkCtx.MultiStore().CacheMultiStore())) - } - for idx, tx := range tmBlock.Block.Txs { - if idx < startIdx { - continue - } if idx > txIndex { break } @@ -537,46 +504,7 @@ func (b *Backend) ReplayTransactionTillIndex(ctx context.Context, block *ethtype } _ = b.app.DeliverTx(sdkCtx, abci.RequestDeliverTxV2{Tx: tx}, sdkTx, sha256.Sum256(tx)) } - finalCtx := sdkCtx.WithIsEVM(true) - b.putReplayState(block.Hash().Hex(), txIndex, finalCtx.WithTraceMode(true).WithMultiStore(finalCtx.MultiStore().CacheMultiStore())) - return state.NewDBImpl(finalCtx, b.keeper, true), tmBlock.Block.Txs, nil -} - -func (b *Backend) getReplayState(blockHash string, txIndex int) (sdk.Context, int, bool) { - state, ok := b.replayStateCache.Get(blockHash) - if !ok { - return sdk.Context{}, 0, false - } - state.mu.Lock() - defer state.mu.Unlock() - bestIdx := math.MinInt - var bestCtx sdk.Context - for idx, ctx := range state.checkpoints { - if idx <= txIndex && idx > bestIdx { - bestIdx = idx - bestCtx = ctx - } - } - if bestIdx == math.MinInt { - return sdk.Context{}, 0, false - } - return bestCtx, bestIdx, true -} - -func (b *Backend) putReplayState(blockHash string, txIndex int, ctx sdk.Context) { - state, ok := b.replayStateCache.Get(blockHash) - if !ok { - b.replayStateCacheMu.Lock() - state, ok = b.replayStateCache.Get(blockHash) - if !ok { - state = &blockReplayState{checkpoints: map[int]sdk.Context{}} - b.replayStateCache.Add(blockHash, state) - } - b.replayStateCacheMu.Unlock() - } - state.mu.Lock() - state.checkpoints[txIndex] = ctx - state.mu.Unlock() + return state.NewDBImpl(sdkCtx.WithIsEVM(true), b.keeper, true), tmBlock.Block.Txs, nil } func (b *Backend) StateAtBlock(ctx context.Context, block *ethtypes.Block, reexec uint64, base vm.StateDB, readOnly bool, preferDisk bool) (vm.StateDB, 
tracers.StateReleaseFunc, error) { diff --git a/evmrpc/simulate_cache_test.go b/evmrpc/simulate_cache_test.go deleted file mode 100644 index f9bca37107..0000000000 --- a/evmrpc/simulate_cache_test.go +++ /dev/null @@ -1,93 +0,0 @@ -package evmrpc - -import ( - "fmt" - "sync" - "testing" - "time" - - "github.com/hashicorp/golang-lru/v2/expirable" - "github.com/stretchr/testify/require" - - sdk "github.com/sei-protocol/sei-chain/sei-cosmos/types" -) - -func newTestBackend(size int, ttl time.Duration) *Backend { - return &Backend{ - replayStateCacheMu: &sync.Mutex{}, - replayStateCache: expirable.NewLRU[string, *blockReplayState](size, nil, ttl), - } -} - -// TestReplayStateCache_GetPut exercises the basic round-trip and the -// "best checkpoint <= txIndex" selection logic used to resume a trace. -func TestReplayStateCache_GetPut(t *testing.T) { - b := newTestBackend(replayStateCacheBlocks, replayStateCacheTTL) - hash := "0xabc" - - ctx0 := sdk.Context{}.WithBlockHeight(100) - ctx5 := sdk.Context{}.WithBlockHeight(105) - ctx10 := sdk.Context{}.WithBlockHeight(110) - - b.putReplayState(hash, -1, ctx0) - b.putReplayState(hash, 5, ctx5) - b.putReplayState(hash, 10, ctx10) - - // Asking for txIndex=7 should return the checkpoint at idx=5. - got, idx, ok := b.getReplayState(hash, 7) - require.True(t, ok) - require.Equal(t, 5, idx) - require.Equal(t, int64(105), got.BlockHeight()) - - // Asking for txIndex=0 should return the -1 checkpoint. - got, idx, ok = b.getReplayState(hash, 0) - require.True(t, ok) - require.Equal(t, -1, idx) - require.Equal(t, int64(100), got.BlockHeight()) - - // Unknown block returns false. - _, _, ok = b.getReplayState("0xmissing", 0) - require.False(t, ok) -} - -// TestReplayStateCache_EvictsOldBlocks is the regression test for the -// unbounded-memory-growth bug: distinct blocks beyond the cache size must -// be evicted, not retained forever. 
-func TestReplayStateCache_EvictsOldBlocks(t *testing.T) { - const size = 4 - b := newTestBackend(size, time.Hour) - - for i := 0; i < size*3; i++ { - hash := fmt.Sprintf("0x%d", i) - b.putReplayState(hash, 0, sdk.Context{}.WithBlockHeight(int64(i))) - } - - require.LessOrEqual(t, b.replayStateCache.Len(), size, - "cache must not grow beyond its configured size") - - _, _, ok := b.getReplayState("0x0", 0) - require.False(t, ok, "oldest block must have been evicted") - - _, _, ok = b.getReplayState(fmt.Sprintf("0x%d", size*3-1), 0) - require.True(t, ok, "most recently added block must still be cached") -} - -// TestReplayStateCache_Concurrent runs parallel puts/gets to catch -// data races on the per-block inner map. Run with `go test -race`. -func TestReplayStateCache_Concurrent(t *testing.T) { - b := newTestBackend(replayStateCacheBlocks, replayStateCacheTTL) - - var wg sync.WaitGroup - for w := 0; w < 8; w++ { - wg.Add(1) - go func(w int) { - defer wg.Done() - hash := fmt.Sprintf("block-%d", w%4) // 4 distinct blocks, contention on each - for i := 0; i < 100; i++ { - b.putReplayState(hash, i, sdk.Context{}.WithBlockHeight(int64(i))) - _, _, _ = b.getReplayState(hash, i) - } - }(w) - } - wg.Wait() -} From f0ed88c50ab4984bb754cde5428d3d31966e462c Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Fri, 17 Apr 2026 13:51:37 -0400 Subject: [PATCH 13/15] refactor: collapse db_descending.go into db.go, rename iterator.go The three-way file split (db.go / db_descending.go / db_ascending.go) implied two equal modes when the intent is one canonical path plus a backstop. Fold the descending implementation back into db.go so the canonical Database lives in one place, and keep db_ascending.go and iterator_ascending.go as the clearly-labeled legacy carveouts. Rename iterator_descending.go -> iterator.go for the same reason: it is the iterator; iterator_ascending.go is the compat variant. 
File layout after: - db.go - Database, OpenDB, dispatchers, writes, descending mode implementation, shared helpers, metrics - db_ascending.go - legacy ascending-mode carveout for pre-migration DBs - iterator.go - descending iterator (the iterator) - iterator_ascending.go - legacy iterator carveout No behavior change. Co-Authored-By: Claude Opus 4.7 (1M context) --- sei-db/db_engine/pebbledb/mvcc/db.go | 308 +++++++++++++++++ .../db_engine/pebbledb/mvcc/db_descending.go | 323 ------------------ .../{iterator_descending.go => iterator.go} | 0 3 files changed, 308 insertions(+), 323 deletions(-) delete mode 100644 sei-db/db_engine/pebbledb/mvcc/db_descending.go rename sei-db/db_engine/pebbledb/mvcc/{iterator_descending.go => iterator.go} (100%) diff --git a/sei-db/db_engine/pebbledb/mvcc/db.go b/sei-db/db_engine/pebbledb/mvcc/db.go index 9e890ddb6a..bf4a7fccba 100644 --- a/sei-db/db_engine/pebbledb/mvcc/db.go +++ b/sei-db/db_engine/pebbledb/mvcc/db.go @@ -17,7 +17,9 @@ import ( "github.com/cockroachdb/pebble/v2/sstable" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" + "golang.org/x/exp/slices" + errorutils "github.com/sei-protocol/sei-chain/sei-db/common/errors" "github.com/sei-protocol/sei-chain/sei-db/common/utils" "github.com/sei-protocol/sei-chain/sei-db/config" "github.com/sei-protocol/sei-chain/sei-db/db_engine/types" @@ -515,6 +517,312 @@ func (db *Database) ReverseIterator(storeKey string, version int64, start, end [ return db.reverseIteratorAscending(storeKey, version, start, end) } +// --------------------------------------------------------------------------- +// Descending-mode implementation (the fast path used by DBs created by this +// build). Versions of a logical key sort newest-first on disk, so Pebble's +// First() / SeekGE lands directly on the latest visible version without +// iterating older ones. The ascending-mode counterparts live in +// db_ascending.go for legacy DBs. 
+// --------------------------------------------------------------------------- + +func (db *Database) hasDescending(storeKey string, version int64, key []byte) (bool, error) { + if version < db.GetEarliestVersion() { + return false, nil + } + + val, err := db.getDescending(storeKey, version, key) + if err != nil { + return false, err + } + + return val != nil, nil +} + +func (db *Database) getDescending(storeKey string, targetVersion int64, key []byte) (_ []byte, _err error) { + startTime := time.Now() + defer func() { + otelMetrics.getLatency.Record( + context.Background(), + time.Since(startTime).Seconds(), + metric.WithAttributes( + attribute.Bool("success", _err == nil), + attribute.String("store", storeKey), + ), + ) + }() + if targetVersion < db.GetEarliestVersion() { + return nil, nil + } + + prefixedVal, err := getMVCCSliceDescending(db.storage, storeKey, key, targetVersion) + if err != nil { + if errors.Is(err, errorutils.ErrRecordNotFound) { + return nil, nil + } + + return nil, fmt.Errorf("failed to perform PebbleDB read: %w", err) + } + + return visibleValueAtVersionDescending(prefixedVal, targetVersion) +} + +// pruneDescending attempts to prune all versions up to and including the current version +// Get the range of keys, manually iterate over them and delete them +// We add a heuristic to skip over a module's keys during pruning if it hasn't been updated +// since the last time pruning occurred. +// NOTE: There is a rare case when a module's keys are skipped during pruning even though +// it has been updated. This occurs when that module's keys are updated in between pruning runs, the node after is restarted. +// This is not a large issue given the next time that module is updated, it will be properly pruned thereafter. 
+func (db *Database) pruneDescending(version int64) (_err error) { + // Defensive check: ensure database is not closed + if db.storage == nil { + return errors.New("pebbledb: database is closed") + } + + startTime := time.Now() + defer func() { + otelMetrics.pruneLatency.Record( + context.Background(), + time.Since(startTime).Seconds(), + metric.WithAttributes( + attribute.Bool("success", _err == nil), + ), + ) + }() + + earliestVersion := version + 1 // we increment by 1 to include the provided version + + itr, err := db.storage.NewIter(nil) + if err != nil { + return err + } + defer func() { _ = itr.Close() }() + + batch := db.storage.NewBatch() + defer func() { _ = batch.Close() }() + + var ( + counter int + prevKey []byte + keptBelowPrune bool + prevStore string + ) + + for itr.First(); itr.Valid(); { + currKeyEncoded := slices.Clone(itr.Key()) + + // Ignore metadata entries during pruning + if isMetadataKey(currKeyEncoded) { + itr.Next() + continue + } + + // Store current key and version + currKey, currVersion, currOK := SplitMVCCKey(currKeyEncoded) + if !currOK { + return fmt.Errorf("invalid MVCC key") + } + + storeKey, err := parseStoreKey(currKey) + if err != nil { + // XXX: This should never happen given we skip the metadata keys. + return err + } + + // For every new module visited, check to see last time it was updated + if storeKey != prevStore { + prevStore = storeKey + updated, ok := db.storeKeyDirty.Load(storeKey) + versionUpdated, typeOk := updated.(int64) + // Skip a store's keys if version it was last updated is less than last prune height + if !ok || (typeOk && versionUpdated < db.GetEarliestVersion()) { + itr.SeekGE(storePrefix(storeKey + "0")) + continue + } + } + + currVersionDecoded, err := decodeUint64Descending(currVersion) + if err != nil { + return err + } + + // Reset per-logical-key state when the logical key changes. 
+ if !bytes.Equal(prevKey, currKey) { + prevKey = slices.Clone(currKey) + keptBelowPrune = false + + // Fast path: under descending encoding, versions of a key are stored + // newest-first. When the newest real version is above the prune + // height, seek directly to the first version <= prune height for + // this key instead of iterating through every above-prune version. + if currVersionDecoded > version { + itr.SeekGE(MVCCEncodeDescending(currKey, version)) + continue + } + } + + // Descending iteration: for a given logical key we see newest→oldest. + // Versions > prune height are always kept. For versions <= prune + // height, keep only the newest one when KeepLastVersion is true; + // delete every other such version. + if currVersionDecoded <= version { + if db.config.KeepLastVersion && !keptBelowPrune { + keptBelowPrune = true + } else { + if err := batch.Delete(currKeyEncoded, nil); err != nil { + return err + } + counter++ + if counter >= PruneCommitBatchSize { + if err := batch.Commit(defaultWriteOpts); err != nil { + return err + } + counter = 0 + batch.Reset() + } + } + } + + itr.Next() + } + + // Commit any leftover delete ops in batch + if counter > 0 { + err = batch.Commit(defaultWriteOpts) + if err != nil { + return err + } + } + + return db.SetEarliestVersion(earliestVersion, false) +} + +func (db *Database) iteratorDescending(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { + if (start != nil && len(start) == 0) || (end != nil && len(end) == 0) { + return nil, errorutils.ErrKeyEmpty + } + + if start != nil && end != nil && bytes.Compare(start, end) > 0 { + return nil, errorutils.ErrStartAfterEnd + } + + lowerBound := MVCCEncodeDescending(prependStoreKey(storeKey, start), 0) + + var upperBound []byte + if end != nil { + upperBound = MVCCEncodeDescending(prependStoreKey(storeKey, end), 0) + } else { + upperBound = iteratorUpperBoundForStoreDescending(storeKey) + } + + itr, err := 
db.storage.NewIter(&pebble.IterOptions{LowerBound: lowerBound, UpperBound: upperBound}) + if err != nil { + return nil, fmt.Errorf("failed to create PebbleDB iterator: %w", err) + } + + return newPebbleDBIterator(itr, storePrefix(storeKey), start, end, version, db.GetEarliestVersion(), false, storeKey), nil +} + +func (db *Database) reverseIteratorDescending(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { + if (start != nil && len(start) == 0) || (end != nil && len(end) == 0) { + return nil, errorutils.ErrKeyEmpty + } + + if start != nil && end != nil && bytes.Compare(start, end) > 0 { + return nil, errorutils.ErrStartAfterEnd + } + + lowerBound := MVCCEncodeDescending(prependStoreKey(storeKey, start), 0) + + var upperBound []byte + if end != nil { + upperBound = MVCCEncodeDescending(prependStoreKey(storeKey, end), 0) + } else { + upperBound = MVCCEncodeDescending(prefixEnd(storePrefix(storeKey)), 0) + } + + itr, err := db.storage.NewIter(&pebble.IterOptions{LowerBound: lowerBound, UpperBound: upperBound}) + if err != nil { + return nil, fmt.Errorf("failed to create PebbleDB iterator: %w", err) + } + + return newPebbleDBIterator(itr, storePrefix(storeKey), start, end, version, db.GetEarliestVersion(), true, storeKey), nil +} + +func getMVCCSliceDescending(db *pebble.DB, storeKey string, key []byte, version int64) (_ []byte, err error) { + prefixedKey := prependStoreKey(storeKey, key) + itr, err := db.NewIter(&pebble.IterOptions{ + LowerBound: MVCCEncodeDescending(prefixedKey, version), + UpperBound: iteratorUpperBoundForLogicalKeyDescending(prefixedKey), + }) + if err != nil { + return nil, fmt.Errorf("failed to create PebbleDB iterator: %w", err) + } + defer func() { + err = errorutils.Join(err, itr.Close()) + }() + + if !itr.First() { + return nil, errorutils.ErrRecordNotFound + } + return decodeMVCCEntryDescending(itr.Key(), itr.Value(), prefixedKey, version) +} + +// decodeMVCCEntryDescending validates that the iterator's current 
entry +// belongs to prefixedKey at a version <= target and returns a safe copy of the +// value. Assumes descending version encoding. +func decodeMVCCEntryDescending(rawIterKey, rawIterValue, prefixedKey []byte, version int64) ([]byte, error) { + userKey, vBz, ok := SplitMVCCKey(rawIterKey) + if !ok { + return nil, fmt.Errorf("invalid PebbleDB MVCC key: %s", rawIterKey) + } + if !bytes.Equal(userKey, prefixedKey) { + return nil, errorutils.ErrRecordNotFound + } + keyVersion, err := decodeUint64Descending(vBz) + if err != nil { + return nil, fmt.Errorf("failed to decode key version: %w", err) + } + if keyVersion > version { + return nil, errorutils.ErrRecordNotFound + } + return slices.Clone(rawIterValue), nil +} + +func visibleValueAtVersionDescending(prefixedVal []byte, targetVersion int64) ([]byte, error) { + valBz, tombBz, ok := SplitMVCCKey(prefixedVal) + if !ok { + return nil, fmt.Errorf("invalid PebbleDB MVCC value: %s", prefixedVal) + } + if len(tombBz) == 0 { + return valBz, nil + } + tombstone, err := decodeUint64Descending(tombBz) + if err != nil { + return nil, fmt.Errorf("failed to decode value tombstone: %w", err) + } + if targetVersion < tombstone { + return valBz, nil + } + return nil, nil +} + +func iteratorUpperBoundForStoreDescending(storeKey string) []byte { + upperStorePrefix := prefixEnd(storePrefix(storeKey)) + if upperStorePrefix == nil { + return nil + } + return MVCCEncodeDescending(upperStorePrefix, 0) +} + +func iteratorUpperBoundForLogicalKeyDescending(key []byte) []byte { + upperKeyPrefix := prefixEnd(key) + if upperKeyPrefix == nil { + return nil + } + return MVCCEncodeDescending(upperKeyPrefix, 0) +} + // Taken from pebbledb prefix upper bound // Returns smallest key strictly greater than the prefix func prefixEnd(b []byte) []byte { diff --git a/sei-db/db_engine/pebbledb/mvcc/db_descending.go b/sei-db/db_engine/pebbledb/mvcc/db_descending.go deleted file mode 100644 index ad80bf2a2d..0000000000 --- 
a/sei-db/db_engine/pebbledb/mvcc/db_descending.go +++ /dev/null @@ -1,323 +0,0 @@ -package mvcc - -import ( - "bytes" - "context" - "errors" - "fmt" - "time" - - "github.com/cockroachdb/pebble/v2" - "go.opentelemetry.io/otel/attribute" - "go.opentelemetry.io/otel/metric" - "golang.org/x/exp/slices" - - errorutils "github.com/sei-protocol/sei-chain/sei-db/common/errors" - "github.com/sei-protocol/sei-chain/sei-db/db_engine/types" -) - -// This file contains the descending-version MVCC implementation used by DBs -// created by this build. It is the fast path: versions of a logical key sort -// newest-first on disk, so Pebble's First() / SeekGE lands directly on the -// latest visible version without iterating older ones. -// -// Callers go through the dispatchers in db.go; nothing here should be invoked -// directly by code outside the package. - -func (db *Database) hasDescending(storeKey string, version int64, key []byte) (bool, error) { - if version < db.GetEarliestVersion() { - return false, nil - } - - val, err := db.getDescending(storeKey, version, key) - if err != nil { - return false, err - } - - return val != nil, nil -} - -func (db *Database) getDescending(storeKey string, targetVersion int64, key []byte) (_ []byte, _err error) { - startTime := time.Now() - defer func() { - otelMetrics.getLatency.Record( - context.Background(), - time.Since(startTime).Seconds(), - metric.WithAttributes( - attribute.Bool("success", _err == nil), - attribute.String("store", storeKey), - ), - ) - }() - if targetVersion < db.GetEarliestVersion() { - return nil, nil - } - - prefixedVal, err := getMVCCSliceDescending(db.storage, storeKey, key, targetVersion) - if err != nil { - if errors.Is(err, errorutils.ErrRecordNotFound) { - return nil, nil - } - - return nil, fmt.Errorf("failed to perform PebbleDB read: %w", err) - } - - return visibleValueAtVersionDescending(prefixedVal, targetVersion) -} - -// pruneDescending attempts to prune all versions up to and including the current 
version -// Get the range of keys, manually iterate over them and delete them -// We add a heuristic to skip over a module's keys during pruning if it hasn't been updated -// since the last time pruning occurred. -// NOTE: There is a rare case when a module's keys are skipped during pruning even though -// it has been updated. This occurs when that module's keys are updated in between pruning runs, the node after is restarted. -// This is not a large issue given the next time that module is updated, it will be properly pruned thereafter. -func (db *Database) pruneDescending(version int64) (_err error) { - // Defensive check: ensure database is not closed - if db.storage == nil { - return errors.New("pebbledb: database is closed") - } - - startTime := time.Now() - defer func() { - otelMetrics.pruneLatency.Record( - context.Background(), - time.Since(startTime).Seconds(), - metric.WithAttributes( - attribute.Bool("success", _err == nil), - ), - ) - }() - - earliestVersion := version + 1 // we increment by 1 to include the provided version - - itr, err := db.storage.NewIter(nil) - if err != nil { - return err - } - defer func() { _ = itr.Close() }() - - batch := db.storage.NewBatch() - defer func() { _ = batch.Close() }() - - var ( - counter int - prevKey []byte - keptBelowPrune bool - prevStore string - ) - - for itr.First(); itr.Valid(); { - currKeyEncoded := slices.Clone(itr.Key()) - - // Ignore metadata entries during pruning - if isMetadataKey(currKeyEncoded) { - itr.Next() - continue - } - - // Store current key and version - currKey, currVersion, currOK := SplitMVCCKey(currKeyEncoded) - if !currOK { - return fmt.Errorf("invalid MVCC key") - } - - storeKey, err := parseStoreKey(currKey) - if err != nil { - // XXX: This should never happen given we skip the metadata keys. 
- return err - } - - // For every new module visited, check to see last time it was updated - if storeKey != prevStore { - prevStore = storeKey - updated, ok := db.storeKeyDirty.Load(storeKey) - versionUpdated, typeOk := updated.(int64) - // Skip a store's keys if version it was last updated is less than last prune height - if !ok || (typeOk && versionUpdated < db.GetEarliestVersion()) { - itr.SeekGE(storePrefix(storeKey + "0")) - continue - } - } - - currVersionDecoded, err := decodeUint64Descending(currVersion) - if err != nil { - return err - } - - // Reset per-logical-key state when the logical key changes. - if !bytes.Equal(prevKey, currKey) { - prevKey = slices.Clone(currKey) - keptBelowPrune = false - - // Fast path: under descending encoding, versions of a key are stored - // newest-first. When the newest real version is above the prune - // height, seek directly to the first version <= prune height for - // this key instead of iterating through every above-prune version. - if currVersionDecoded > version { - itr.SeekGE(MVCCEncodeDescending(currKey, version)) - continue - } - } - - // Descending iteration: for a given logical key we see newest→oldest. - // Versions > prune height are always kept. For versions <= prune - // height, keep only the newest one when KeepLastVersion is true; - // delete every other such version. 
- if currVersionDecoded <= version { - if db.config.KeepLastVersion && !keptBelowPrune { - keptBelowPrune = true - } else { - if err := batch.Delete(currKeyEncoded, nil); err != nil { - return err - } - counter++ - if counter >= PruneCommitBatchSize { - if err := batch.Commit(defaultWriteOpts); err != nil { - return err - } - counter = 0 - batch.Reset() - } - } - } - - itr.Next() - } - - // Commit any leftover delete ops in batch - if counter > 0 { - err = batch.Commit(defaultWriteOpts) - if err != nil { - return err - } - } - - return db.SetEarliestVersion(earliestVersion, false) -} - -func (db *Database) iteratorDescending(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { - if (start != nil && len(start) == 0) || (end != nil && len(end) == 0) { - return nil, errorutils.ErrKeyEmpty - } - - if start != nil && end != nil && bytes.Compare(start, end) > 0 { - return nil, errorutils.ErrStartAfterEnd - } - - lowerBound := MVCCEncodeDescending(prependStoreKey(storeKey, start), 0) - - var upperBound []byte - if end != nil { - upperBound = MVCCEncodeDescending(prependStoreKey(storeKey, end), 0) - } else { - upperBound = iteratorUpperBoundForStoreDescending(storeKey) - } - - itr, err := db.storage.NewIter(&pebble.IterOptions{LowerBound: lowerBound, UpperBound: upperBound}) - if err != nil { - return nil, fmt.Errorf("failed to create PebbleDB iterator: %w", err) - } - - return newPebbleDBIterator(itr, storePrefix(storeKey), start, end, version, db.GetEarliestVersion(), false, storeKey), nil -} - -func (db *Database) reverseIteratorDescending(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { - if (start != nil && len(start) == 0) || (end != nil && len(end) == 0) { - return nil, errorutils.ErrKeyEmpty - } - - if start != nil && end != nil && bytes.Compare(start, end) > 0 { - return nil, errorutils.ErrStartAfterEnd - } - - lowerBound := MVCCEncodeDescending(prependStoreKey(storeKey, start), 0) - - var upperBound []byte - 
if end != nil { - upperBound = MVCCEncodeDescending(prependStoreKey(storeKey, end), 0) - } else { - upperBound = MVCCEncodeDescending(prefixEnd(storePrefix(storeKey)), 0) - } - - itr, err := db.storage.NewIter(&pebble.IterOptions{LowerBound: lowerBound, UpperBound: upperBound}) - if err != nil { - return nil, fmt.Errorf("failed to create PebbleDB iterator: %w", err) - } - - return newPebbleDBIterator(itr, storePrefix(storeKey), start, end, version, db.GetEarliestVersion(), true, storeKey), nil -} - -func getMVCCSliceDescending(db *pebble.DB, storeKey string, key []byte, version int64) (_ []byte, err error) { - prefixedKey := prependStoreKey(storeKey, key) - itr, err := db.NewIter(&pebble.IterOptions{ - LowerBound: MVCCEncodeDescending(prefixedKey, version), - UpperBound: iteratorUpperBoundForLogicalKeyDescending(prefixedKey), - }) - if err != nil { - return nil, fmt.Errorf("failed to create PebbleDB iterator: %w", err) - } - defer func() { - err = errorutils.Join(err, itr.Close()) - }() - - if !itr.First() { - return nil, errorutils.ErrRecordNotFound - } - return decodeMVCCEntryDescending(itr.Key(), itr.Value(), prefixedKey, version) -} - -// decodeMVCCEntryDescending validates that the iterator's current entry -// belongs to prefixedKey at a version <= target and returns a safe copy of the -// value. Assumes descending version encoding. 
-func decodeMVCCEntryDescending(rawIterKey, rawIterValue, prefixedKey []byte, version int64) ([]byte, error) { - userKey, vBz, ok := SplitMVCCKey(rawIterKey) - if !ok { - return nil, fmt.Errorf("invalid PebbleDB MVCC key: %s", rawIterKey) - } - if !bytes.Equal(userKey, prefixedKey) { - return nil, errorutils.ErrRecordNotFound - } - keyVersion, err := decodeUint64Descending(vBz) - if err != nil { - return nil, fmt.Errorf("failed to decode key version: %w", err) - } - if keyVersion > version { - return nil, errorutils.ErrRecordNotFound - } - return slices.Clone(rawIterValue), nil -} - -func visibleValueAtVersionDescending(prefixedVal []byte, targetVersion int64) ([]byte, error) { - valBz, tombBz, ok := SplitMVCCKey(prefixedVal) - if !ok { - return nil, fmt.Errorf("invalid PebbleDB MVCC value: %s", prefixedVal) - } - if len(tombBz) == 0 { - return valBz, nil - } - tombstone, err := decodeUint64Descending(tombBz) - if err != nil { - return nil, fmt.Errorf("failed to decode value tombstone: %w", err) - } - if targetVersion < tombstone { - return valBz, nil - } - return nil, nil -} - -func iteratorUpperBoundForStoreDescending(storeKey string) []byte { - upperStorePrefix := prefixEnd(storePrefix(storeKey)) - if upperStorePrefix == nil { - return nil - } - return MVCCEncodeDescending(upperStorePrefix, 0) -} - -func iteratorUpperBoundForLogicalKeyDescending(key []byte) []byte { - upperKeyPrefix := prefixEnd(key) - if upperKeyPrefix == nil { - return nil - } - return MVCCEncodeDescending(upperKeyPrefix, 0) -} diff --git a/sei-db/db_engine/pebbledb/mvcc/iterator_descending.go b/sei-db/db_engine/pebbledb/mvcc/iterator.go similarity index 100% rename from sei-db/db_engine/pebbledb/mvcc/iterator_descending.go rename to sei-db/db_engine/pebbledb/mvcc/iterator.go From 678f48863af2d43b8d56769ae39f9d2c63256375 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Fri, 17 Apr 2026 15:27:23 -0400 Subject: [PATCH 14/15] refactor: extract retrieveVersionKey and writeBatchOps helpers Two identical 
code paths were waiting for a single kernel: - retrieveLatestVersion and retrieveEarliestVersion differed only in the metadata key name. Fold both into retrieveVersionKey(db, key). - Batch.Write and RawBatch.Write share the otel metrics defer, the pebble batch open/close, sortBatchOps, and the ops-application loop. The only delta is the latestVersionKey stamp in Batch.Write. Extract writeBatchOps(storage, ops, beforeCommit) and pass the stamp as a hook from Batch.Write; RawBatch.Write passes nil. No behavior change. Co-Authored-By: Claude Opus 4.7 (1M context) --- sei-db/db_engine/pebbledb/mvcc/batch.go | 74 +++++++++---------------- sei-db/db_engine/pebbledb/mvcc/db.go | 36 ++++-------- 2 files changed, 39 insertions(+), 71 deletions(-) diff --git a/sei-db/db_engine/pebbledb/mvcc/batch.go b/sei-db/db_engine/pebbledb/mvcc/batch.go index bcb5e3f5d4..3930a18874 100644 --- a/sei-db/db_engine/pebbledb/mvcc/batch.go +++ b/sei-db/db_engine/pebbledb/mvcc/batch.go @@ -67,45 +67,15 @@ func (b *Batch) Delete(storeKey string, key []byte) error { return b.set(storeKey, b.version, key, []byte(tombstoneVal)) } -func (b *Batch) Write() (err error) { - startTime := time.Now() - batchSize := int64(len(b.ops)) - - defer func() { - ctx := context.Background() - otelMetrics.batchWriteLatency.Record( - ctx, - time.Since(startTime).Seconds(), - metric.WithAttributes(attribute.Bool("success", err == nil)), - ) - otelMetrics.batchSize.Record( - ctx, - batchSize, - ) - }() - - batch := b.storage.NewBatch() - defer func() { - err = errors.Join(err, batch.Close()) - }() - sortBatchOps(b.ops) - for _, op := range b.ops { - if op.delete { - if e := batch.Delete(op.key, nil); e != nil { - return fmt.Errorf("failed to delete in PebbleDB batch: %w", e) - } - continue +func (b *Batch) Write() error { + return writeBatchOps(b.storage, b.ops, func(batch *pebble.Batch) error { + var versionBz [VersionSize]byte + binary.LittleEndian.PutUint64(versionBz[:], uint64(b.version)) //nolint:gosec // block 
heights are non-negative and fit in int64 + if err := batch.Set([]byte(latestVersionKey), versionBz[:], nil); err != nil { + return fmt.Errorf("failed to set latest version in batch: %w", err) } - if e := batch.Set(op.key, op.value, nil); e != nil { - return fmt.Errorf("failed to write PebbleDB batch: %w", e) - } - } - var versionBz [VersionSize]byte - binary.LittleEndian.PutUint64(versionBz[:], uint64(b.version)) //nolint:gosec // block heights are non-negative and fit in int64 - if err := batch.Set([]byte(latestVersionKey), versionBz[:], nil); err != nil { - return fmt.Errorf("failed to set latest version in batch: %w", err) - } - return batch.Commit(defaultWriteOpts) + return nil + }) } // For writing kv pairs in any order of version @@ -162,9 +132,17 @@ func (b *Batch) HardDelete(storeKey string, key []byte) error { return nil } -func (b *RawBatch) Write() (err error) { +func (b *RawBatch) Write() error { + return writeBatchOps(b.storage, b.ops, nil) +} + +// writeBatchOps applies ops to a new pebble batch in sorted order, records +// otel metrics, and commits. The optional beforeCommit hook runs on the +// pebble batch right before commit (used by Batch.Write to stamp the +// latest-version metadata key). 
+func writeBatchOps(storage *pebble.DB, ops []batchOp, beforeCommit func(*pebble.Batch) error) (err error) { startTime := time.Now() - batchSize := int64(len(b.ops)) + batchSize := int64(len(ops)) defer func() { ctx := context.Background() otelMetrics.batchWriteLatency.Record( @@ -172,18 +150,15 @@ func (b *RawBatch) Write() (err error) { time.Since(startTime).Seconds(), metric.WithAttributes(attribute.Bool("success", err == nil)), ) - otelMetrics.batchSize.Record( - ctx, - batchSize, - ) + otelMetrics.batchSize.Record(ctx, batchSize) }() - batch := b.storage.NewBatch() + batch := storage.NewBatch() defer func() { err = errors.Join(err, batch.Close()) }() - sortBatchOps(b.ops) - for _, op := range b.ops { + sortBatchOps(ops) + for _, op := range ops { if op.delete { if e := batch.Delete(op.key, nil); e != nil { return fmt.Errorf("failed to delete in PebbleDB batch: %w", e) @@ -194,6 +169,11 @@ func (b *RawBatch) Write() (err error) { return fmt.Errorf("failed to write PebbleDB batch: %w", e) } } + if beforeCommit != nil { + if err := beforeCommit(batch); err != nil { + return err + } + } return batch.Commit(defaultWriteOpts) } diff --git a/sei-db/db_engine/pebbledb/mvcc/db.go b/sei-db/db_engine/pebbledb/mvcc/db.go index bf4a7fccba..5f83cb2dab 100644 --- a/sei-db/db_engine/pebbledb/mvcc/db.go +++ b/sei-db/db_engine/pebbledb/mvcc/db.go @@ -307,24 +307,7 @@ func detectMVCCMode(db *pebble.DB) (bool, error) { } func retrieveLatestVersion(db *pebble.DB) (int64, error) { - bz, closer, err := db.Get([]byte(latestVersionKey)) - defer func() { - if closer != nil { - _ = closer.Close() - } - }() - if err != nil || len(bz) == 0 { - if errors.Is(err, pebble.ErrNotFound) { - return 0, nil - } - return 0, err - } - - uz := binary.LittleEndian.Uint64(bz) - if uz > math.MaxInt64 { - return 0, fmt.Errorf("latest version in database overflows int64: %d", uz) - } - return int64(uz), nil + return retrieveVersionKey(db, latestVersionKey) } func (db *Database) SetEarliestVersion(version 
int64, ignoreVersion bool) error { @@ -351,7 +334,13 @@ func (db *Database) GetEarliestVersion() int64 { // Retrieves earliest version from db, if not found, return 0 func retrieveEarliestVersion(db *pebble.DB) (int64, error) { - bz, closer, err := db.Get([]byte(earliestVersionKey)) + return retrieveVersionKey(db, earliestVersionKey) +} + +// retrieveVersionKey reads a little-endian uint64 version from the given +// metadata key. Returns 0 when the key is absent (fresh DB). +func retrieveVersionKey(db *pebble.DB, key string) (int64, error) { + bz, closer, err := db.Get([]byte(key)) defer func() { if closer != nil { _ = closer.Close() @@ -363,12 +352,11 @@ func retrieveEarliestVersion(db *pebble.DB) (int64, error) { } return 0, err } - - ubz := binary.LittleEndian.Uint64(bz) - if ubz > math.MaxInt64 { - return 0, fmt.Errorf("earliest version in database overflows int64: %d", ubz) + u := binary.LittleEndian.Uint64(bz) + if u > math.MaxInt64 { + return 0, fmt.Errorf("version at %q overflows int64: %d", key, u) } - return int64(ubz), nil + return int64(u), nil } // Has dispatches between descending- and ascending-mode implementations From 39ae1660f129d5b6b0397164e0d0c565c0ec1ce6 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Fri, 17 Apr 2026 16:07:50 -0400 Subject: [PATCH 15/15] chore: drop stray blank line in evmrpc/simulate.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Leftover from the earlier replay-cache revert — the one-line whitespace delta was the only difference from main in this file. Match main exactly so simulate.go drops out of the PR's diff stat. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- evmrpc/simulate.go | 1 - 1 file changed, 1 deletion(-) diff --git a/evmrpc/simulate.go b/evmrpc/simulate.go index 4f0267ae0f..8da2dd2f1c 100644 --- a/evmrpc/simulate.go +++ b/evmrpc/simulate.go @@ -490,7 +490,6 @@ func (b *Backend) ReplayTransactionTillIndex(ctx context.Context, block *ethtype if txIndex < 0 { return state.NewDBImpl(sdkCtx.WithIsEVM(true), b.keeper, true), tmBlock.Block.Txs, nil } - for idx, tx := range tmBlock.Block.Txs { if idx > txIndex { break