Skip to content

Commit 2c307c8

Browse files
authored
feat(log): improve log for gRPC, DB, and shutdown (#6700)
* feat(log): bridge JUL to SLF4J and improve shutdown log reliability ### What Changed - gRPC now appear in the application log instead of going only to standard error. - Database size stats are no longer printed at info level during startup, which reduces log noise. - Shutdown now waits longer before stopping the log system, giving pending messages more time to flush. - Async appender now allows up to 5 seconds to drain its queue on shutdown. - Interrupted shutdowns now show a clearer log message. - Test log config is INFO level. ### Impact `✅ Fewer missed gRPC error logs` `✅ Cleaner startup output` `✅ Fewer lost shutdown logs` `✅ Fewer test logs` * docs(log): clarify resetJUL vs handler-removal responsibilities * refactor(log): clean up TronLogShutdownHook constants, timing, and javadoc * fix(log): fail fast on unreadable --log-config instead of silent fallback * feat(log): slow signature verifications above 50ms * feat(log): watchdog-warn when LevelDB open stalls * test(log): pin TronLogShutdownHook wait-window semantics * style(log): clarify LogService comment and tighten test local var casing * refactor(log): drop manifest probe from LevelDB open watchdog * feat(log): route grpc.* logs to dedicated grpc.log * fix(log): make grpc logger asynchronous * chore(log): register framework and plugin logger topics * chore(log): raise slow signature verification to warn
1 parent 5ef7de6 commit 2c307c8

16 files changed

Lines changed: 580 additions & 49 deletions

File tree

build.gradle

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,9 @@ subprojects {
8181
}
8282

8383
dependencies {
84-
implementation group: 'org.slf4j', name: 'slf4j-api', version: '1.7.25'
85-
implementation group: 'org.slf4j', name: 'jcl-over-slf4j', version: '1.7.25'
84+
implementation group: 'org.slf4j', name: 'slf4j-api', version: '1.7.36'
85+
implementation group: 'org.slf4j', name: 'jcl-over-slf4j', version: '1.7.36'
86+
implementation group: 'org.slf4j', name: 'jul-to-slf4j', version: '1.7.36'
8687
implementation group: 'ch.qos.logback', name: 'logback-classic', version: '1.2.13'
8788
implementation "com.google.code.findbugs:jsr305:3.0.0"
8889
implementation group: 'org.springframework', name: 'spring-context', version: "${springVersion}"

chainbase/src/main/java/org/tron/common/storage/leveldb/LevelDbDataSourceImpl.java

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@
3232
import java.util.Map;
3333
import java.util.Map.Entry;
3434
import java.util.Set;
35+
import java.util.concurrent.ScheduledExecutorService;
36+
import java.util.concurrent.ScheduledFuture;
37+
import java.util.concurrent.TimeUnit;
3538
import java.util.concurrent.locks.ReadWriteLock;
3639
import java.util.concurrent.locks.ReentrantReadWriteLock;
3740
import java.util.stream.Collectors;
@@ -45,6 +48,7 @@
4548
import org.iq80.leveldb.ReadOptions;
4649
import org.iq80.leveldb.WriteBatch;
4750
import org.iq80.leveldb.WriteOptions;
51+
import org.tron.common.es.ExecutorServiceManager;
4852
import org.tron.common.parameter.CommonParameter;
4953
import org.tron.common.storage.WriteOptionsWrapper;
5054
import org.tron.common.storage.metric.DbStat;
@@ -61,6 +65,11 @@
6165
public class LevelDbDataSourceImpl extends DbStat implements DbSourceInter<byte[]>,
6266
Iterable<Entry<byte[], byte[]>>, Instance<LevelDbDataSourceImpl> {
6367

68+
/** First watchdog WARN fires this many seconds after factory.open() begins. */
69+
private static final long OPEN_WATCHDOG_INITIAL_DELAY_SEC = 60;
70+
/** Subsequent watchdog WARN lines are emitted on this interval. */
71+
private static final long OPEN_WATCHDOG_PERIOD_SEC = 30;
72+
6473
private String dataBaseName;
6574
private DB database;
6675
private volatile boolean alive;
@@ -121,6 +130,14 @@ private void openDatabase(Options dbOptions) throws IOException {
121130
if (!Files.isSymbolicLink(dbPath.getParent())) {
122131
Files.createDirectories(dbPath.getParent());
123132
}
133+
final long openStartNs = System.nanoTime();
134+
ScheduledExecutorService watchdog = ExecutorServiceManager
135+
.newSingleThreadScheduledExecutor("db-open-watchdog-" + dataBaseName, true);
136+
ScheduledFuture<?> watchdogTask = watchdog.scheduleAtFixedRate(
137+
() -> logSlowOpen(dbPath, openStartNs),
138+
OPEN_WATCHDOG_INITIAL_DELAY_SEC,
139+
OPEN_WATCHDOG_PERIOD_SEC,
140+
TimeUnit.SECONDS);
124141
try {
125142
DbSourceInter.checkOrInitEngine(getEngine(), dbPath.toString(),
126143
TronError.ErrCode.LEVELDB_INIT);
@@ -139,6 +156,28 @@ private void openDatabase(Options dbOptions) throws IOException {
139156
logger.error("Open Database {} failed", dataBaseName, e);
140157
}
141158
throw new TronError(e, TronError.ErrCode.LEVELDB_INIT);
159+
} finally {
160+
watchdogTask.cancel(false);
161+
watchdog.shutdownNow();
162+
}
163+
}
164+
165+
/**
166+
* Emits a WARN when factory.open() is still blocked — usually because the
167+
* MANIFEST has grown large enough to make replay expensive.
168+
*/
169+
void logSlowOpen(Path dbPath, long startNs) {
170+
try {
171+
long elapsedSec = TimeUnit.NANOSECONDS.toSeconds(System.nanoTime() - startNs);
172+
logger.warn("DB {} open still in progress after {}s. path={}. "
173+
+ "This startup will complete; to speed up future restarts, run "
174+
+ "`java -jar Toolkit.jar db archive -d {}` before the next startup "
175+
+ "to rebuild the MANIFEST (the tool requires an exclusive DB lock, "
176+
+ "so it cannot run while the node is up).",
177+
dataBaseName, elapsedSec, dbPath, parentPath);
178+
} catch (Exception e) {
179+
// Purely observational - never let the watchdog disrupt startup.
180+
logger.debug("db-open-watchdog failure for {}: {}", dataBaseName, e.getMessage());
142181
}
143182
}
144183

chainbase/src/main/java/org/tron/common/storage/metric/DbStat.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ protected void statProperty() {
1717
double size = Double.parseDouble(tmp[2]) * 1048576.0;
1818
Metrics.gaugeSet(MetricKeys.Gauge.DB_SST_LEVEL, files, getEngine(), getName(), level);
1919
Metrics.gaugeSet(MetricKeys.Gauge.DB_SIZE_BYTES, size, getEngine(), getName(), level);
20-
logger.info("DB {}, level:{},files:{},size:{} M",
20+
logger.debug("DB {}, level:{},files:{},size:{} M",
2121
getName(), level, files, size / 1048576.0);
2222
});
2323
} catch (Exception e) {

chainbase/src/main/java/org/tron/core/capsule/TransactionCapsule.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
import java.util.List;
3636
import java.util.concurrent.ExecutorService;
3737
import java.util.concurrent.Future;
38+
import java.util.concurrent.TimeUnit;
3839
import java.util.concurrent.atomic.AtomicInteger;
3940
import lombok.Getter;
4041
import lombok.Setter;
@@ -94,6 +95,8 @@ public class TransactionCapsule implements ProtoCapsule<Transaction> {
9495
.newFixedThreadPool(esName, CommonParameter.getInstance()
9596
.getValidContractProtoThreadNum());
9697
private static final String OWNER_ADDRESS = "ownerAddress_";
98+
// 2-6 ms in general, so we set 50 ms as the threshold for slow signature verification.
99+
private static final long SLOW_SIG_VERIFY_MS = 50;
97100

98101
private Transaction transaction;
99102
@Setter
@@ -648,6 +651,7 @@ public boolean validatePubSignature(AccountStore accountStore,
648651

649652
byte[] hash = getTransactionId().getBytes();
650653

654+
long startNs = System.nanoTime();
651655
try {
652656
if (!validateSignature(this.transaction, hash, accountStore, dynamicPropertiesStore)) {
653657
isVerified = false;
@@ -656,12 +660,27 @@ public boolean validatePubSignature(AccountStore accountStore,
656660
} catch (SignatureException | PermissionException | SignatureFormatException e) {
657661
isVerified = false;
658662
throw new ValidateSignatureException(e.getMessage());
663+
} finally {
664+
logSlowSigVerify(startNs);
659665
}
660666
isVerified = true;
661667
}
662668
return true;
663669
}
664670

671+
/**
672+
* WARN-logs when a single signature verification exceeds
673+
* {@link #SLOW_SIG_VERIFY_MS}. Package-private so it can be exercised from
674+
* tests without forcing a real slow crypto path.
675+
*/
676+
void logSlowSigVerify(long startNs) {
677+
long costMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNs);
678+
if (costMs > SLOW_SIG_VERIFY_MS) {
679+
logger.warn("slow verify: txId={}, sigCount={}, cost={} ms",
680+
getTransactionId(), this.transaction.getSignatureCount(), costMs);
681+
}
682+
}
683+
665684
/**
666685
* validate signature
667686
*/

common/src/main/java/org/tron/common/log/LogService.java

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
import ch.qos.logback.classic.LoggerContext;
44
import ch.qos.logback.classic.joran.JoranConfigurator;
5+
import ch.qos.logback.classic.jul.LevelChangePropagator;
6+
import ch.qos.logback.classic.spi.LoggerContextListener;
57
import ch.qos.logback.core.util.StatusPrinter;
68
import java.io.File;
79
import org.slf4j.LoggerFactory;
@@ -12,18 +14,43 @@ public class LogService {
1214
public static void load(String path) {
1315
LoggerContext lc = (LoggerContext) LoggerFactory.getILoggerFactory();
1416
try {
15-
File file = new File(path);
16-
if (!file.exists() || !file.isFile() || !file.canRead()) {
17-
return;
17+
// Fail fast rather than silently falling back to the classpath default —
18+
// that legacy behavior misled operators into thinking their custom
19+
// --log-config was active.
20+
if (path != null && !path.isEmpty()) {
21+
File file = new File(path);
22+
if (!file.exists() || !file.isFile() || !file.canRead()) {
23+
throw new IllegalArgumentException(
24+
"logback config is not a readable file: " + path);
25+
}
26+
JoranConfigurator configurator = new JoranConfigurator();
27+
configurator.setContext(lc);
28+
lc.reset();
29+
configurator.doConfigure(file);
1830
}
19-
JoranConfigurator configurator = new JoranConfigurator();
20-
configurator.setContext(lc);
21-
lc.reset();
22-
configurator.doConfigure(file);
31+
// Whether we loaded a custom config via --log-config or kept the classpath
32+
// default, make sure Logback level changes are propagated back to JUL so
33+
// gRPC loggers actually honor the levels declared in the XML. If
34+
// the active config already registered a LevelChangePropagator we leave
35+
// it alone.
36+
ensureLevelChangePropagator(lc);
2337
} catch (Exception e) {
2438
throw new TronError(e, TronError.ErrCode.LOG_LOAD);
2539
} finally {
2640
StatusPrinter.printInCaseOfErrorsOrWarnings(lc);
2741
}
2842
}
43+
44+
private static void ensureLevelChangePropagator(LoggerContext lc) {
45+
for (LoggerContextListener listener : lc.getCopyOfListenerList()) {
46+
if (listener instanceof LevelChangePropagator) {
47+
return;
48+
}
49+
}
50+
LevelChangePropagator propagator = new LevelChangePropagator();
51+
propagator.setContext(lc);
52+
propagator.setResetJUL(true);
53+
propagator.start();
54+
lc.addListener(propagator);
55+
}
2956
}

common/src/main/java/org/tron/common/parameter/CommonParameter.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import java.util.Set;
99
import lombok.Getter;
1010
import lombok.Setter;
11+
import org.slf4j.bridge.SLF4JBridgeHandler;
1112
import org.tron.common.args.GenesisBlock;
1213
import org.tron.common.config.DbBackupConfig;
1314
import org.tron.common.cron.CronExpression;
@@ -23,6 +24,19 @@
2324

2425
public class CommonParameter {
2526

27+
// Install the JUL->SLF4J bridge early so that JUL log records emitted during
28+
// static init of grpc classes (or from unit tests that don't invoke
29+
// LogService.load()) still reach Logback.
30+
// removeHandlersForRootLogger() strips JUL's default ConsoleHandler so the
31+
// same record is not emitted twice (once by JUL's own console output and
32+
// once via the bridge to Logback).
33+
static {
34+
SLF4JBridgeHandler.removeHandlersForRootLogger();
35+
if (!SLF4JBridgeHandler.isInstalled()) {
36+
SLF4JBridgeHandler.install();
37+
}
38+
}
39+
2640
protected static CommonParameter PARAMETER = new CommonParameter();
2741

2842
// Runtime chain state: set by VMConfig.initVmHardFork()

framework/src/main/java/org/tron/core/config/TronLogShutdownHook.java

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import ch.qos.logback.core.hook.ShutdownHookBase;
44
import ch.qos.logback.core.util.Duration;
5-
import org.tron.program.FullNode;
65

76
/**
87
* @author kiven
@@ -16,11 +15,24 @@ public class TronLogShutdownHook extends ShutdownHookBase {
1615
private static final Duration CHECK_SHUTDOWN_DELAY = Duration.buildByMilliseconds(100);
1716

1817
/**
19-
* The check times before shutdown. default is 60000/100 = 600 times.
18+
* Maximum time to wait for a graceful application shutdown before forcing
19+
* a log flush. Each pool managed by ExecutorServiceManager.shutdownAndAwait-
20+
* Termination() can take up to 120 s in the worst case (60 s await +
21+
* shutdownNow + 60 s await). 180 s is therefore not a hard upper bound, but
22+
* a pragmatic headroom that assumes the many pools in the node shut down
23+
* largely in parallel; in pathological cases trailing shutdown logs may
24+
* still be truncated. In practice 180 s of shutdown output is also enough
25+
* to diagnose most stalls — if a pool is still alive past that window the
26+
* earlier logs already carry the stack/trace context needed to locate the
27+
* offender, so truncating the tail is an acceptable trade-off against
28+
* holding JVM exit open indefinitely.
2029
*/
21-
private final long check_times = 60 * 1000 / CHECK_SHUTDOWN_DELAY.getMilliseconds();
30+
private static final long MAX_WAIT_MS = 3 * 60 * 1000;
2231

23-
// if true, shutdown hook will be executed , for example, 'java -jar FullNode.jar -[v|h]'.
32+
private static final long CHECK_TIMES =
33+
MAX_WAIT_MS / CHECK_SHUTDOWN_DELAY.getMilliseconds();
34+
35+
// if true, shutdown hook will be executed, for example, 'java -jar FullNode.jar -[v|h]'.
2436
public static volatile boolean shutDown = true;
2537

2638
public TronLogShutdownHook() {
@@ -29,16 +41,19 @@ public TronLogShutdownHook() {
2941
@Override
3042
public void run() {
3143
try {
32-
for (int i = 0; i < check_times; i++) {
44+
for (long i = 0; i < CHECK_TIMES; i++) {
3345
if (shutDown) {
3446
break;
3547
}
36-
addInfo("Sleeping for " + CHECK_SHUTDOWN_DELAY);
48+
if (i % 100 == 0) {
49+
long elapsedSeconds = i * CHECK_SHUTDOWN_DELAY.getMilliseconds() / 1000;
50+
addInfo("Waiting for application shutdown... elapsed=" + elapsedSeconds + "s");
51+
}
3752
Thread.sleep(CHECK_SHUTDOWN_DELAY.getMilliseconds());
3853
}
3954
} catch (InterruptedException e) {
4055
Thread.currentThread().interrupt();
41-
addInfo("TronLogShutdownHook run error :" + e.getMessage());
56+
addInfo("TronLogShutdownHook interrupted: " + e.getMessage());
4257
}
4358
super.stop();
4459
}

0 commit comments

Comments
 (0)