Skip to content

Commit 1a41581

Browse files
committed
Optimize noisy kernel logs
1 parent 186c0a9 commit 1a41581

6 files changed

Lines changed: 126 additions & 26 deletions

File tree

iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/IoTDBInternalLocalReporter.java

Lines changed: 59 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@
7474
import java.util.concurrent.Future;
7575
import java.util.concurrent.ScheduledExecutorService;
7676
import java.util.concurrent.TimeUnit;
77+
import java.util.concurrent.atomic.AtomicLong;
7778

7879
import static org.apache.iotdb.db.utils.EncodingInferenceUtils.getDefaultEncoding;
7980

@@ -82,11 +83,14 @@ public class IoTDBInternalLocalReporter extends IoTDBInternalReporter {
8283
private static final Logger LOGGER = LoggerFactory.getLogger(IoTDBInternalLocalReporter.class);
8384
private static final SessionManager SESSION_MANAGER = SessionManager.getInstance();
8485
private static final Coordinator COORDINATOR = Coordinator.getInstance();
86+
private static final long METRIC_UPDATE_FAILURE_LOG_INTERVAL_MS = TimeUnit.MINUTES.toMillis(1);
8587
private final SessionInfo sessionInfo;
8688
private final IPartitionFetcher partitionFetcher;
8789
private final ISchemaFetcher schemaFetcher;
8890
private Future<?> currentServiceFuture;
8991
private final ScheduledExecutorService service = Executors.newSingleThreadScheduledExecutor();
92+
private final AtomicLong lastMetricUpdateFailureLogTime = new AtomicLong(0);
93+
private final AtomicLong suppressedMetricUpdateFailureLogCount = new AtomicLong(0);
9094

9195
public IoTDBInternalLocalReporter() {
9296
partitionFetcher = ClusterPartitionFetcher.getInstance();
@@ -174,18 +178,60 @@ protected void writeMetricToIoTDB(Map<String, Object> valueMap, String prefix, l
174178
result = insertRecord(valueMap, prefix, time);
175179
}
176180
if (result.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) {
177-
LOGGER.warn(DataNodeMiscMessages.FAILED_UPDATE_METRIC_VALUE, result);
181+
logFailedUpdateMetricValue(result);
178182
}
179183
} catch (IoTDBConnectionException e1) {
180-
LOGGER.warn(
181-
"Failed to update the value of metric because of connection failure, because ", e1);
184+
logMetricUpdateConnectionFailure(e1);
182185
} catch (IllegalPathException | QueryProcessException e2) {
183-
LOGGER.warn(
184-
"Failed to update the value of metric because of internal error, because ", e2);
186+
logMetricUpdateInternalFailure(e2);
185187
}
186188
});
187189
}
188190

191+
private long getMetricUpdateFailureSuppressedCountIfShouldLog() {
192+
long now = System.currentTimeMillis();
193+
long lastLogTime = lastMetricUpdateFailureLogTime.get();
194+
if (now - lastLogTime >= METRIC_UPDATE_FAILURE_LOG_INTERVAL_MS
195+
&& lastMetricUpdateFailureLogTime.compareAndSet(lastLogTime, now)) {
196+
return suppressedMetricUpdateFailureLogCount.getAndSet(0);
197+
}
198+
suppressedMetricUpdateFailureLogCount.incrementAndGet();
199+
return -1;
200+
}
201+
202+
private void logFailedUpdateMetricValue(TSStatus result) {
203+
long suppressedCount = getMetricUpdateFailureSuppressedCountIfShouldLog();
204+
if (suppressedCount < 0) {
205+
return;
206+
}
207+
LOGGER.warn(
208+
DataNodeMiscMessages.FAILED_UPDATE_METRIC_VALUE + ", suppressedSimilarLogs={}",
209+
result,
210+
suppressedCount);
211+
}
212+
213+
private void logMetricUpdateConnectionFailure(IoTDBConnectionException e) {
214+
long suppressedCount = getMetricUpdateFailureSuppressedCountIfShouldLog();
215+
if (suppressedCount < 0) {
216+
return;
217+
}
218+
LOGGER.warn(
219+
"Failed to update the value of metric because of connection failure, suppressedSimilarLogs={}",
220+
suppressedCount,
221+
e);
222+
}
223+
224+
private void logMetricUpdateInternalFailure(Exception e) {
225+
long suppressedCount = getMetricUpdateFailureSuppressedCountIfShouldLog();
226+
if (suppressedCount < 0) {
227+
return;
228+
}
229+
LOGGER.warn(
230+
"Failed to update the value of metric because of internal error, suppressedSimilarLogs={}",
231+
suppressedCount,
232+
e);
233+
}
234+
189235
private TSStatus insertRecord(Map<String, Object> valueMap, String prefix, long time)
190236
throws IoTDBConnectionException, QueryProcessException, IllegalPathException {
191237
TSInsertRecordReq request = new TSInsertRecordReq();
@@ -242,7 +288,14 @@ private void createTimeSeries(Map<String, Object> valueMap, String prefix)
242288
COORDINATOR.executeForTreeModel(
243289
s, queryId, sessionInfo, "", partitionFetcher, schemaFetcher);
244290
if (result.status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) {
245-
LOGGER.warn(DataNodeMiscMessages.FAILED_AUTO_CREATE_TIMESERIES, paths, result.status);
291+
long suppressedCount = getMetricUpdateFailureSuppressedCountIfShouldLog();
292+
if (suppressedCount >= 0) {
293+
LOGGER.warn(
294+
DataNodeMiscMessages.FAILED_AUTO_CREATE_TIMESERIES + ", suppressedSimilarLogs={}",
295+
paths,
296+
result.status,
297+
suppressedCount);
298+
}
246299
}
247300
}
248301

iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/DataRegion.java

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3777,7 +3777,7 @@ public int executeTTLCheck() throws InterruptedException {
37773777
if (skipCurrentTTLAndModificationCheck()) {
37783778
return 0;
37793779
}
3780-
logger.info(
3780+
logger.debug(
37813781
"[TTL] {}-{} Start ttl and modification checking.", databaseName, dataRegionIdString);
37823782
CompactionScheduleContext context =
37833783
new CompactionScheduleContext(
@@ -3800,12 +3800,19 @@ public int executeTTLCheck() throws InterruptedException {
38003800
if (context.hasSubmitTask()) {
38013801
CompactionMetrics.getInstance().updateCompactionTaskSelectionNum(context);
38023802
}
3803-
logger.info(
3804-
"[TTL] {}-{} Totally select {} all-outdated files and {} partial-outdated files.",
3805-
databaseName,
3806-
dataRegionIdString,
3807-
context.getFullyDirtyFileNum(),
3808-
context.getPartiallyDirtyFileNum());
3803+
if (context.hasSubmitTask()) {
3804+
logger.info(
3805+
"[TTL] {}-{} Totally select {} all-outdated files and {} partial-outdated files.",
3806+
databaseName,
3807+
dataRegionIdString,
3808+
context.getFullyDirtyFileNum(),
3809+
context.getPartiallyDirtyFileNum());
3810+
} else {
3811+
logger.debug(
3812+
"[TTL] {}-{} No all-outdated or partial-outdated files are selected.",
3813+
databaseName,
3814+
dataRegionIdString);
3815+
}
38093816
} catch (InterruptedException e) {
38103817
throw e;
38113818
} catch (Throwable e) {

iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/compaction/schedule/CompactionScheduleTaskWorker.java

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -74,13 +74,16 @@ public Void call() {
7474
} catch (InterruptedException ignored) {
7575
boolean isStoppedByUser =
7676
CompactionScheduleTaskManager.getInstance().isStoppingAllScheduleTask();
77-
logger.info(
78-
"[CompactionScheduleTaskWorker-{}] compaction schedule is interrupted, isStopByUser: {}",
79-
workerId,
80-
isStoppedByUser);
8177
if (isStoppedByUser) {
78+
logger.debug(
79+
"[CompactionScheduleTaskWorker-{}] compaction schedule is interrupted by stop signal",
80+
workerId);
8281
return null;
8382
}
83+
logger.info(
84+
"[CompactionScheduleTaskWorker-{}] compaction schedule is interrupted, isStopByUser: {}",
85+
workerId,
86+
false);
8487
} catch (Exception e) {
8588
logger.error(
8689
"[CompactionScheduleTaskWorker-{}] Failed to execute compaction schedule task",

iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/compaction/schedule/TTLScheduleTask.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,13 +70,12 @@ public Void call() throws Exception {
7070
} catch (StopTTLCheckException | InterruptedException ignored) {
7171
boolean isStoppedByUser =
7272
CompactionScheduleTaskManager.getInstance().isStoppingAllScheduleTask();
73-
logger.info(
74-
"[TTLCheckTask-{}] TTL checker is interrupted, isStoppedByUser: {}",
75-
workerId,
76-
isStoppedByUser);
7773
if (isStoppedByUser) {
74+
logger.debug("[TTLCheckTask-{}] TTL checker is interrupted by stop signal", workerId);
7875
return null;
7976
}
77+
logger.info(
78+
"[TTLCheckTask-{}] TTL checker is interrupted, isStoppedByUser: {}", workerId, false);
8079
} catch (Exception e) {
8180
logger.error(StorageEngineMessages.TTL_CHECK_TASK_FAILED, workerId, e);
8281
} catch (Throwable t) {

iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/WALManager.java

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@
6060
public class WALManager implements IService {
6161
private static final Logger logger = LoggerFactory.getLogger(WALManager.class);
6262
private static final IoTDBConfig config = IoTDBDescriptor.getInstance().getConfig();
63+
private static final long WAL_DISK_USAGE_OVER_THROTTLE_LOG_INTERVAL_MS =
64+
TimeUnit.MINUTES.toMillis(1);
6365

6466
// manage all wal nodes and decide how to allocate them
6567
private final NodeAllocationStrategy walNodesManager;
@@ -69,6 +71,8 @@ public class WALManager implements IService {
6971
private final AtomicLong totalDiskUsage = new AtomicLong();
7072
// total number of wal files
7173
private final AtomicLong totalFileNum = new AtomicLong();
74+
private final AtomicLong lastWalDiskUsageOverThrottleLogTime = new AtomicLong(0);
75+
private final AtomicLong suppressedWalDiskUsageOverThrottleLogCount = new AtomicLong(0);
7276

7377
private WALManager() {
7478
if (config.getDataRegionConsensusProtocolClass().equals(ConsensusFactory.IOT_CONSENSUS)
@@ -184,10 +188,7 @@ private void deleteOutdatedFiles() {
184188
while ((firstLoop || shouldThrottle())) {
185189
deleteOutdatedFilesInWALNodes();
186190
if (firstLoop && shouldThrottle()) {
187-
logger.warn(
188-
"WAL disk usage {} is larger than the wal_throttle_threshold_in_byte * 0.8 {}, please check your write load, iot consensus and the pipe module. It's better to allocate more disk for WAL.",
189-
getTotalDiskUsage(),
190-
getThrottleThreshold());
191+
logWalDiskUsageOverThrottle();
191192
}
192193
firstLoop = false;
193194
if (Thread.interrupted()) {
@@ -197,6 +198,24 @@ private void deleteOutdatedFiles() {
197198
}
198199
}
199200

201+
private void logWalDiskUsageOverThrottle() {
202+
long now = System.currentTimeMillis();
203+
long lastLogTime = lastWalDiskUsageOverThrottleLogTime.get();
204+
if (now - lastLogTime >= WAL_DISK_USAGE_OVER_THROTTLE_LOG_INTERVAL_MS
205+
&& lastWalDiskUsageOverThrottleLogTime.compareAndSet(lastLogTime, now)) {
206+
long suppressedCount = suppressedWalDiskUsageOverThrottleLogCount.getAndSet(0);
207+
logger.warn(
208+
"WAL disk usage {} is larger than the wal_throttle_threshold_in_byte * 0.8 {}, "
209+
+ "please check your write load, iot consensus and the pipe module. "
210+
+ "It's better to allocate more disk for WAL. suppressedSimilarLogs={}",
211+
getTotalDiskUsage(),
212+
getThrottleThreshold(),
213+
suppressedCount);
214+
} else {
215+
suppressedWalDiskUsageOverThrottleLogCount.incrementAndGet();
216+
}
217+
}
218+
200219
protected void deleteOutdatedFilesInWALNodes() {
201220
if (config.getWalMode() == WALMode.DISABLE) {
202221
return;

iotdb-core/datanode/src/main/java/org/apache/iotdb/db/trigger/service/TriggerInformationUpdater.java

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,13 @@
4040
import java.util.concurrent.Future;
4141
import java.util.concurrent.ScheduledExecutorService;
4242
import java.util.concurrent.TimeUnit;
43+
import java.util.concurrent.atomic.AtomicLong;
4344
import java.util.stream.Collectors;
4445

4546
public class TriggerInformationUpdater {
4647

4748
private static final Logger LOGGER = LoggerFactory.getLogger(TriggerInformationUpdater.class);
49+
private static final long UPDATE_FAILURE_LOG_INTERVAL_MS = TimeUnit.MINUTES.toMillis(1);
4850

4951
private static final IClientManager<ConfigRegionId, ConfigNodeClient> CONFIG_NODE_CLIENT_MANAGER =
5052
ConfigNodeClientManager.getInstance();
@@ -54,6 +56,8 @@ public class TriggerInformationUpdater {
5456
ThreadName.STATEFUL_TRIGGER_INFORMATION_UPDATER.getName());
5557

5658
private Future<?> updateFuture;
59+
private final AtomicLong lastUpdateFailureLogTime = new AtomicLong(0);
60+
private final AtomicLong suppressedUpdateFailureLogCount = new AtomicLong(0);
5761

5862
private static final long UPDATE_INTERVAL = 1000L * 60;
5963

@@ -98,7 +102,22 @@ public void updateTask() {
98102
triggerInformation.getTriggerName(), triggerInformation.getDataNodeLocation());
99103
}
100104
} catch (Exception e) {
101-
LOGGER.warn(DataNodeMiscMessages.ERROR_UPDATING_TRIGGER_INFO, e);
105+
logUpdateFailure(e);
106+
}
107+
}
108+
109+
private void logUpdateFailure(Exception e) {
110+
long now = System.currentTimeMillis();
111+
long lastLogTime = lastUpdateFailureLogTime.get();
112+
if (now - lastLogTime >= UPDATE_FAILURE_LOG_INTERVAL_MS
113+
&& lastUpdateFailureLogTime.compareAndSet(lastLogTime, now)) {
114+
long suppressedCount = suppressedUpdateFailureLogCount.getAndSet(0);
115+
LOGGER.warn(
116+
DataNodeMiscMessages.ERROR_UPDATING_TRIGGER_INFO + " suppressedSimilarLogs={}",
117+
suppressedCount,
118+
e);
119+
} else {
120+
suppressedUpdateFailureLogCount.incrementAndGet();
102121
}
103122
}
104123
}

0 commit comments

Comments
 (0)