Skip to content

Commit c14030d

Browse files
LukaszRozmej and claude
committed
fix(db): cross-column snapshot atomicity in SnapshotableMemColumnsDb
SnapshotableMemColumnsDb.CreateSnapshot iterates columns in a loop and captures each column's snapshot independently. InMemoryColumnWriteBatch.Dispose iterates the per-column batches and disposes each independently. Without a global guard, a CreateSnapshot call concurrent with an in-flight writeBatch.Dispose can capture some columns AFTER the new writes and others BEFORE — producing a cross-column-inconsistent reader view. This race only matters in the test backend (RocksDB has atomic cross-CF snapshots), but it manifested concretely in E2ESyncTests.SnapSync on Windows: post-snap-sync block processing leases a fresh persistenceReader while the persistence pipeline is committing, gets a snapshot whose Accounts column is updated but whose StateNodes column is not, then walks the trie at the new state root and throws TrieNodeException for a node that "should" be there. Add an exclusive System.Threading.Lock that is held by both CreateSnapshot and writeBatch.Dispose so multi-column commits are atomic w.r.t. snapshot creation. Because the lock is exclusive (it is not a reader/writer lock), concurrent CreateSnapshot calls serialize with each other as well as with a multi-column commit; the contention that matters for correctness is the rare overlap of a snapshot creation with a multi-column commit. This eliminates the TrieNodeException failure mode in the stress reproducer (`SnapSync_StressRepro`). The "missing in flat" mode is a separate, deterministic bug (the same ~9 addresses miss reliably) that appears unaffected by this fix and needs follow-up investigation. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent edc0d3d commit c14030d

1 file changed

Lines changed: 33 additions & 1 deletion

File tree

src/Nethermind/Nethermind.Db/SnapshotableMemColumnsDb.cs

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
using System;
55
using System.Collections.Generic;
6+
using System.Threading;
67
using Nethermind.Core;
78

89
namespace Nethermind.Db
@@ -16,6 +17,16 @@ public class SnapshotableMemColumnsDb<TKey> : IColumnsDb<TKey> where TKey : stru
1617
private readonly Dictionary<TKey, SnapshotableMemDb> _columnDbs = new();
1718
private readonly bool _neverPrune;
1819

20+
// Cross-column atomicity guard. Each per-column SnapshotableMemDb has its own version
21+
// counter and lock, so per-column reads/writes are individually consistent. But a
22+
// multi-column writeBatch dispose applies columns one-by-one, and CreateSnapshot
23+
// captures column snapshots one-by-one. Without this lock a snapshot taken concurrently
24+
// with an in-flight writeBatch dispose can capture some columns AFTER the new writes
25+
// and others BEFORE, producing a cross-column-inconsistent reader view. RocksDB does
26+
// not have this problem (its snapshots are atomic across CFs); this lock makes the
27+
// in-memory test backend match.
28+
private readonly Lock _atomicityLock = new();
29+
1930
private SnapshotableMemColumnsDb(TKey[] keys, bool neverPrune)
2031
{
2132
_neverPrune = neverPrune;
@@ -55,10 +66,11 @@ public IDb GetColumnDb(TKey key)
5566

5667
public IReadOnlyColumnDb<TKey> CreateReadOnly(bool createInMemWriteStore) => new ReadOnlyColumnsDb<TKey>(this, createInMemWriteStore);
5768

58-
public IColumnsWriteBatch<TKey> StartWriteBatch() => new InMemoryColumnWriteBatch<TKey>(this);
69+
public IColumnsWriteBatch<TKey> StartWriteBatch() => new AtomicColumnsWriteBatch(this);
5970

6071
public IColumnDbSnapshot<TKey> CreateSnapshot()
6172
{
73+
using Lock.Scope _ = _atomicityLock.EnterScope();
6274
Dictionary<TKey, IKeyValueStoreSnapshot> snapshots = new();
6375
foreach (KeyValuePair<TKey, SnapshotableMemDb> kvp in _columnDbs)
6476
{
@@ -67,6 +79,26 @@ public IColumnDbSnapshot<TKey> CreateSnapshot()
6779
return new ColumnSnapshot(snapshots);
6880
}
6981

82+
/// <summary>
83+
/// Wraps <see cref="InMemoryColumnWriteBatch{TKey}"/> so the per-column commit phase
84+
/// happens under the columns DB's atomicity lock, making the multi-column commit atomic
85+
/// w.r.t. <see cref="CreateSnapshot"/>.
86+
/// </summary>
87+
private sealed class AtomicColumnsWriteBatch(SnapshotableMemColumnsDb<TKey> db) : IColumnsWriteBatch<TKey>
88+
{
89+
private readonly InMemoryColumnWriteBatch<TKey> _inner = new(db);
90+
91+
public IWriteBatch GetColumnBatch(TKey key) => _inner.GetColumnBatch(key);
92+
93+
public void Clear() => _inner.Clear();
94+
95+
public void Dispose()
96+
{
97+
using Lock.Scope _ = db._atomicityLock.EnterScope();
98+
_inner.Dispose();
99+
}
100+
}
101+
70102
public void Dispose()
71103
{
72104
foreach (SnapshotableMemDb db in _columnDbs.Values)

0 commit comments

Comments
 (0)