Skip to content

Commit cb6db14

Browse files
committed
feat(log): watchdog-warn when LevelDB open stalls
1 parent d6065a4 commit cb6db14

2 files changed

Lines changed: 151 additions & 0 deletions

File tree

chainbase/src/main/java/org/tron/common/storage/leveldb/LevelDbDataSourceImpl.java

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@
2020
import com.google.common.annotations.VisibleForTesting;
2121
import com.google.common.collect.Sets;
2222
import com.google.common.primitives.Bytes;
23+
import java.io.File;
2324
import java.io.IOException;
25+
import java.nio.charset.StandardCharsets;
2426
import java.nio.file.Files;
2527
import java.nio.file.Path;
2628
import java.nio.file.Paths;
@@ -29,9 +31,14 @@
2931
import java.util.Collections;
3032
import java.util.HashMap;
3133
import java.util.List;
34+
import java.util.Locale;
3235
import java.util.Map;
3336
import java.util.Map.Entry;
3437
import java.util.Set;
38+
import java.util.concurrent.ScheduledExecutorService;
39+
import java.util.concurrent.ScheduledFuture;
40+
import java.util.concurrent.TimeUnit;
41+
import java.util.concurrent.atomic.AtomicReference;
3542
import java.util.concurrent.locks.ReadWriteLock;
3643
import java.util.concurrent.locks.ReentrantReadWriteLock;
3744
import java.util.stream.Collectors;
@@ -45,6 +52,7 @@
4552
import org.iq80.leveldb.ReadOptions;
4653
import org.iq80.leveldb.WriteBatch;
4754
import org.iq80.leveldb.WriteOptions;
55+
import org.tron.common.es.ExecutorServiceManager;
4856
import org.tron.common.parameter.CommonParameter;
4957
import org.tron.common.storage.WriteOptionsWrapper;
5058
import org.tron.common.storage.metric.DbStat;
@@ -61,6 +69,13 @@
6169
public class LevelDbDataSourceImpl extends DbStat implements DbSourceInter<byte[]>,
6270
Iterable<Entry<byte[], byte[]>>, Instance<LevelDbDataSourceImpl> {
6371

72+
/** First watchdog WARN fires this many seconds after factory.open() begins. */
73+
private static final long OPEN_WATCHDOG_INITIAL_DELAY_SEC = 60;
74+
/** Subsequent watchdog WARN lines are emitted on this interval. */
75+
private static final long OPEN_WATCHDOG_PERIOD_SEC = 30;
76+
/** Value of {@code Filename.currentFileName()}. */
77+
private static final String LEVELDB_CURRENT_FILE = "CURRENT";
78+
6479
private String dataBaseName;
6580
private DB database;
6681
private volatile boolean alive;
@@ -121,6 +136,15 @@ private void openDatabase(Options dbOptions) throws IOException {
121136
if (!Files.isSymbolicLink(dbPath.getParent())) {
122137
Files.createDirectories(dbPath.getParent());
123138
}
139+
final long openStartNs = System.nanoTime();
140+
final AtomicReference<String> manifestInfo = new AtomicReference<>();
141+
ScheduledExecutorService watchdog = ExecutorServiceManager
142+
.newSingleThreadScheduledExecutor("db-open-watchdog-" + dataBaseName, true);
143+
ScheduledFuture<?> watchdogTask = watchdog.scheduleAtFixedRate(
144+
() -> logSlowOpen(dbPath, openStartNs, manifestInfo),
145+
OPEN_WATCHDOG_INITIAL_DELAY_SEC,
146+
OPEN_WATCHDOG_PERIOD_SEC,
147+
TimeUnit.SECONDS);
124148
try {
125149
DbSourceInter.checkOrInitEngine(getEngine(), dbPath.toString(),
126150
TronError.ErrCode.LEVELDB_INIT);
@@ -139,7 +163,55 @@ private void openDatabase(Options dbOptions) throws IOException {
139163
logger.error("Open Database {} failed", dataBaseName, e);
140164
}
141165
throw new TronError(e, TronError.ErrCode.LEVELDB_INIT);
166+
} finally {
167+
watchdogTask.cancel(false);
168+
watchdog.shutdownNow();
169+
}
170+
}
171+
172+
/**
173+
* Emits a WARN when factory.open() is still blocked — usually because the
174+
* MANIFEST has grown large enough to make replay expensive.
175+
*/
176+
void logSlowOpen(Path dbPath, long startNs, AtomicReference<String> manifestInfoCache) {
177+
try {
178+
long elapsedSec = TimeUnit.NANOSECONDS.toSeconds(System.nanoTime() - startNs);
179+
String manifestInfo = manifestInfoCache.get();
180+
if (manifestInfo == null) {
181+
manifestInfo = resolveManifestInfo(dbPath.toFile());
182+
manifestInfoCache.compareAndSet(null, manifestInfo);
183+
}
184+
logger.warn("DB {} open still in progress after {}s. path={}, {}. "
185+
+ "This startup will complete; to speed up future restarts, run "
186+
+ "`java -jar Toolkit.jar db archive -d {}` before the next startup "
187+
+ "to rebuild the MANIFEST (the tool requires an exclusive DB lock, "
188+
+ "so it cannot run while the node is up).",
189+
dataBaseName, elapsedSec, dbPath, manifestInfo, parentPath);
190+
} catch (Exception e) {
191+
// Purely observational - never let the watchdog disrupt startup.
192+
logger.debug("db-open-watchdog failure for {}: {}", dataBaseName, e.getMessage());
193+
}
194+
}
195+
196+
private static String resolveManifestInfo(File dbDir) {
197+
File currentFile = new File(dbDir, LEVELDB_CURRENT_FILE);
198+
String name = "none";
199+
long size = 0;
200+
if (currentFile.isFile()) {
201+
try {
202+
name = new String(Files.readAllBytes(currentFile.toPath()),
203+
StandardCharsets.UTF_8).trim();
204+
File manifest = new File(dbDir, name);
205+
if (manifest.isFile()) {
206+
size = manifest.length();
207+
}
208+
} catch (IOException ignored) {
209+
// Best-effort — keep defaults. A new DB won't hit the 60s threshold
210+
// anyway, so reporting 0.00 MB here is the expected shape.
211+
}
142212
}
213+
return String.format(Locale.ROOT, "MANIFEST=%s (%.2f MB)", name,
214+
size / 1024.0 / 1024.0);
143215
}
144216

145217
public Path getDbPath() {

framework/src/test/java/org/tron/common/storage/leveldb/LevelDbDataSourceImplTest.java

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,23 @@
1919
package org.tron.common.storage.leveldb;
2020

2121
import static org.junit.Assert.assertEquals;
22+
import static org.junit.Assert.assertNotNull;
2223
import static org.junit.Assert.assertThrows;
24+
import static org.junit.Assert.assertTrue;
2325

26+
import ch.qos.logback.classic.Level;
27+
import ch.qos.logback.classic.Logger;
28+
import ch.qos.logback.classic.spi.ILoggingEvent;
29+
import ch.qos.logback.core.read.ListAppender;
2430
import java.io.File;
2531
import java.io.IOException;
32+
import java.nio.charset.StandardCharsets;
33+
import java.nio.file.Files;
34+
import java.nio.file.Path;
35+
import java.util.List;
36+
import java.util.concurrent.TimeUnit;
37+
import java.util.concurrent.atomic.AtomicReference;
38+
import java.util.stream.Collectors;
2639
import org.junit.AfterClass;
2740
import org.junit.Assert;
2841
import org.junit.Before;
@@ -32,11 +45,13 @@
3245
import org.junit.rules.ExpectedException;
3346
import org.junit.rules.TemporaryFolder;
3447
import org.rocksdb.RocksDB;
48+
import org.slf4j.LoggerFactory;
3549
import org.tron.common.TestConstants;
3650
import org.tron.common.parameter.CommonParameter;
3751
import org.tron.common.storage.rocksdb.RocksDbDataSourceImpl;
3852
import org.tron.common.utils.FileUtil;
3953
import org.tron.common.utils.PropUtil;
54+
import org.tron.common.utils.ReflectUtils;
4055
import org.tron.common.utils.StorageUtils;
4156
import org.tron.core.config.args.Args;
4257
import org.tron.core.exception.TronError;
@@ -122,4 +137,68 @@ private void makeExceptionDb(String dbName) {
122137
FileUtil.saveData(dataSource.getDbPath().toString() + "/CURRENT",
123138
"...", Boolean.FALSE);
124139
}
140+
141+
@Test
142+
public void slowOpen() throws IOException {
143+
Logger dbLogger = (Logger) LoggerFactory.getLogger("DB");
144+
ListAppender<ILoggingEvent> dbAppender = new ListAppender<>();
145+
dbAppender.start();
146+
dbLogger.addAppender(dbAppender);
147+
try {
148+
final File dbDir = temporaryFolder.newFolder();
149+
final Path dbPath = dbDir.toPath();
150+
final String manifest = "MANIFEST-000042";
151+
final String WATCHDOG_DB_NAME = "slow-open-db";
152+
Files.write(dbPath.resolve(manifest), new byte[1024 * 1024]);
153+
Files.write(dbPath.resolve("CURRENT"), (manifest + "\n").getBytes(StandardCharsets.UTF_8));
154+
155+
LevelDbDataSourceImpl ds = new LevelDbDataSourceImpl();
156+
ReflectUtils.setFieldValue(ds, "dataBaseName", WATCHDOG_DB_NAME);
157+
ReflectUtils.setFieldValue(ds, "parentPath", dbDir.getParent());
158+
long startNs = System.nanoTime() - TimeUnit.SECONDS.toNanos(61);
159+
AtomicReference<String> cache = new AtomicReference<>();
160+
ds.logSlowOpen(dbPath, startNs, cache);
161+
162+
List<ILoggingEvent> warns = dbAppender.list.stream()
163+
.filter(e -> e.getLevel() == Level.WARN)
164+
.collect(Collectors.toList());
165+
assertEquals("expected exactly one WARN event", 1, warns.size());
166+
ILoggingEvent warn = warns.get(0);
167+
assertNotNull("expected one WARN from the watchdog helper", warn);
168+
String rendered = warn.getFormattedMessage();
169+
assertTrue("WARN should mention the MANIFEST filename: " + rendered,
170+
rendered.contains(manifest));
171+
assertTrue("WARN should include MANIFEST size in MB: " + rendered,
172+
rendered.contains("(1.00 MB)"));
173+
assertTrue("WARN should include the Toolkit remediation hint: " + rendered,
174+
rendered.contains("Toolkit.jar db archive -d"));
175+
assertTrue("WARN should echo the db name: " + rendered,
176+
rendered.contains(WATCHDOG_DB_NAME));
177+
assertNotNull("cache should hold the resolved MANIFEST info", cache.get());
178+
} finally {
179+
dbAppender.stop();
180+
dbLogger.detachAppender(dbAppender);
181+
}
182+
}
183+
184+
@Test
185+
public void fastOpen() {
186+
Logger dbLogger = (Logger) LoggerFactory.getLogger("DB");
187+
ListAppender<ILoggingEvent> dbAppender = new ListAppender<>();
188+
dbAppender.start();
189+
dbLogger.addAppender(dbAppender);
190+
try {
191+
String dir = Args.getInstance().getOutputDirectory()
192+
+ Args.getInstance().getStorage().getDbDirectory();
193+
LevelDbDataSourceImpl ds = new LevelDbDataSourceImpl(dir, "test_fast_open");
194+
ds.closeDB();
195+
long warnCount = dbAppender.list.stream()
196+
.filter(e -> e.getLevel() == Level.WARN)
197+
.count();
198+
assertEquals("no WARN should fire for a fast open", 0, warnCount);
199+
} finally {
200+
dbAppender.stop();
201+
dbLogger.detachAppender(dbAppender);
202+
}
203+
}
125204
}

0 commit comments

Comments
 (0)