Skip to content

Commit 561c093

Browse files
joshuacolvin0 and claude committed
rawdb: add freezer safety margin to prevent data loss after unclean shutdown
Add a safety margin (freezerBatchLimit = 30,000 blocks) before deleting frozen blocks from LevelDB. This ensures that after an unclean shutdown, when repair() truncates unflushed freezer writes, the data still exists in LevelDB for re-freezing. Critical for L2 nodes that cannot re-download blocks from peers. Refuse to start when data loss exceeds the safety margin, with a clear error message telling the operator to restore from a snapshot. Add edge case tests for the safety margin cleanup logic. Disk overhead: ~30K blocks duplicated temporarily (~30-600 MB). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent ceea349 commit 561c093

5 files changed

Lines changed: 785 additions & 43 deletions

File tree

core/rawdb/accessors_chain.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,26 @@ func WriteLastPivotNumber(db ethdb.KeyValueWriter, pivot uint64) {
258258
}
259259
}
260260

261+
// ReadFreezerCleanupTail retrieves the block number up to which frozen blocks
262+
// have been cleaned up (deleted) from the key-value database.
263+
func ReadFreezerCleanupTail(db ethdb.KeyValueReader) (uint64, bool) {
264+
data, _ := db.Get(freezerCleanupTailKey)
265+
if len(data) != 8 {
266+
return 0, false
267+
}
268+
return binary.BigEndian.Uint64(data), true
269+
}
270+
271+
// WriteFreezerCleanupTail stores the block number up to which frozen blocks
272+
// have been cleaned up from the key-value database.
273+
func WriteFreezerCleanupTail(db ethdb.KeyValueWriter, number uint64) {
274+
var buf [8]byte
275+
binary.BigEndian.PutUint64(buf[:], number)
276+
if err := db.Put(freezerCleanupTailKey, buf[:]); err != nil {
277+
log.Crit("Failed to store freezer cleanup tail", "err", err)
278+
}
279+
}
280+
261281
// ReadTxIndexTail retrieves the number of oldest indexed block
262282
// whose transaction indices has been indexed.
263283
func ReadTxIndexTail(db ethdb.KeyValueReader) *uint64 {

core/rawdb/chain_freezer.go

Lines changed: 79 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,21 @@ const (
4040
freezerBatchLimit = 30000
4141
)
4242

43+
// freezerCleanupMargin is the number of blocks to keep in the key-value
44+
// database after they have been frozen into the ancient store. This acts
45+
// as a safety margin: after an unclean shutdown, repair() truncates
46+
// unflushed freezer writes. The data still exists in LevelDB and can be
47+
// re-frozen. Without this margin, a crash could leave blocks missing
48+
// from both stores, making the node unable to start (especially for L2
49+
// nodes without peers).
50+
//
51+
// Set to freezerBatchLimit because each freeze cycle writes at most
52+
// freezerBatchLimit blocks to the ancient store. If the node crashes
53+
// before these writes are fsynced (SyncAncient), repair() may truncate
54+
// the unflushed entries on restart. Keeping this many blocks in LevelDB
55+
// ensures the truncated data can be re-frozen.
56+
var freezerCleanupMargin uint64 = freezerBatchLimit
57+
4358
// chainFreezer is a wrapper of chain ancient store with additional chain freezing
4459
// feature. The background thread will keep moving ancient chain segments from
4560
// key-value database to flat files for saving space on live database.
@@ -220,13 +235,37 @@ func (f *chainFreezer) freeze(db ethdb.KeyValueStore) {
220235
if err := f.SyncAncient(); err != nil {
221236
log.Crit("Failed to flush frozen tables", "err", err)
222237
}
223-
// Wipe out all data from the active database
238+
// Delete blocks from LevelDB that are safely behind the freeze point.
239+
// Recently-frozen blocks are kept in both stores so that after an
240+
// unclean shutdown, repair()-truncated entries can be re-frozen.
241+
frozen, _ = f.Ancients() // no error will occur, safe to ignore; reload after freezeRange
242+
cleanupStart := uint64(1) // always keep genesis
243+
if prev, ok := ReadFreezerCleanupTail(db); ok && prev > 1 {
244+
cleanupStart = prev
245+
} else if frozen > freezerCleanupMargin {
246+
// First run with safety margin: prior code already deleted frozen
247+
// blocks from LevelDB immediately, so skip ahead to avoid pointless
248+
// reads of already-deleted blocks.
249+
cleanupStart = frozen - freezerCleanupMargin
250+
WriteFreezerCleanupTail(db, cleanupStart)
251+
}
252+
cleanupLimit := uint64(0)
253+
if frozen > freezerCleanupMargin {
254+
cleanupLimit = frozen - freezerCleanupMargin
255+
}
256+
// Cap per-cycle work to avoid stalling when cleanup has a large backlog
257+
// (e.g., first run after upgrade). During catch-up the node keeps more
258+
// blocks in LevelDB than ultimately needed (extra disk, not extra latency).
259+
if cleanupLimit > cleanupStart+freezerBatchLimit {
260+
cleanupLimit = cleanupStart + freezerBatchLimit
261+
}
262+
// Wipe out canonical data from the active database.
224263
batch := db.NewBatch()
225-
for i := 0; i < len(ancients); i++ {
226-
// Always keep the genesis block in active database
227-
if first+uint64(i) != 0 {
228-
DeleteBlockWithoutNumber(batch, ancients[i], first+uint64(i))
229-
DeleteCanonicalHash(batch, first+uint64(i))
264+
for number := cleanupStart; number < cleanupLimit; number++ {
265+
hash := ReadCanonicalHash(nfdb, number)
266+
if hash != (common.Hash{}) {
267+
DeleteBlockWithoutNumber(batch, hash, number)
268+
DeleteCanonicalHash(batch, number)
230269
}
231270
}
232271
if err := batch.Write(); err != nil {
@@ -236,15 +275,11 @@ func (f *chainFreezer) freeze(db ethdb.KeyValueStore) {
236275

237276
// Wipe out side chains also and track dangling side chains
238277
var dangling []common.Hash
239-
frozen, _ = f.Ancients() // Needs reload after during freezeRange
240-
for number := first; number < frozen; number++ {
241-
// Always keep the genesis block in active database
242-
if number != 0 {
243-
dangling = ReadAllHashes(db, number)
244-
for _, hash := range dangling {
245-
log.Trace("Deleting side chain", "number", number, "hash", hash)
246-
DeleteBlock(batch, hash, number)
247-
}
278+
for number := cleanupStart; number < cleanupLimit; number++ {
279+
dangling = ReadAllHashes(db, number)
280+
for _, hash := range dangling {
281+
log.Trace("Deleting side chain", "number", number, "hash", hash)
282+
DeleteBlock(batch, hash, number)
248283
}
249284
}
250285
if err := batch.Write(); err != nil {
@@ -253,37 +288,38 @@ func (f *chainFreezer) freeze(db ethdb.KeyValueStore) {
253288
batch.Reset()
254289

255290
// Step into the future and delete any dangling side chains
256-
if frozen > 0 {
257-
tip := frozen
258-
for len(dangling) > 0 {
259-
drop := make(map[common.Hash]struct{})
260-
for _, hash := range dangling {
261-
log.Debug("Dangling parent from Freezer", "number", tip-1, "hash", hash)
262-
drop[hash] = struct{}{}
291+
tip := cleanupLimit
292+
for len(dangling) > 0 {
293+
drop := make(map[common.Hash]struct{})
294+
for _, hash := range dangling {
295+
log.Debug("Dangling parent from Freezer", "number", tip-1, "hash", hash)
296+
drop[hash] = struct{}{}
297+
}
298+
children := ReadAllHashes(db, tip)
299+
for i := 0; i < len(children); i++ {
300+
// Dig up the child and ensure it's dangling
301+
child := ReadHeader(nfdb, children[i], tip)
302+
if child == nil {
303+
log.Error("Missing dangling header", "number", tip, "hash", children[i])
304+
continue
263305
}
264-
children := ReadAllHashes(db, tip)
265-
for i := 0; i < len(children); i++ {
266-
// Dig up the child and ensure it's dangling
267-
child := ReadHeader(nfdb, children[i], tip)
268-
if child == nil {
269-
log.Error("Missing dangling header", "number", tip, "hash", children[i])
270-
continue
271-
}
272-
if _, ok := drop[child.ParentHash]; !ok {
273-
children = append(children[:i], children[i+1:]...)
274-
i--
275-
continue
276-
}
277-
// Delete all block data associated with the child
278-
log.Debug("Deleting dangling block", "number", tip, "hash", children[i], "parent", child.ParentHash)
279-
DeleteBlock(batch, children[i], tip)
306+
if _, ok := drop[child.ParentHash]; !ok {
307+
children = append(children[:i], children[i+1:]...)
308+
i--
309+
continue
280310
}
281-
dangling = children
282-
tip++
283-
}
284-
if err := batch.Write(); err != nil {
285-
log.Crit("Failed to delete dangling side blocks", "err", err)
311+
// Delete all block data associated with the child
312+
log.Debug("Deleting dangling block", "number", tip, "hash", children[i], "parent", child.ParentHash)
313+
DeleteBlock(batch, children[i], tip)
286314
}
315+
dangling = children
316+
tip++
317+
}
318+
if err := batch.Write(); err != nil {
319+
log.Crit("Failed to delete dangling side blocks", "err", err)
320+
}
321+
if cleanupStart < cleanupLimit {
322+
WriteFreezerCleanupTail(db, cleanupLimit)
287323
}
288324

289325
// Log something friendly for the user

0 commit comments

Comments
 (0)