Skip to content

Commit e9afd32

Browse files
authored
[improvement](memory) Optimize the log of process memory insufficient and support regular GC cache (#16084)
1. When the process memory is insufficient, print the process memory statistics in a more timely and detailed manner. 2. Support regular GC cache, currently only page cache and chunk allocator are included, because many people reported that the memory does not drop after the query ends. 3. Reduce system available memory warning water mark to reduce memory waste 4. Optimize soft mem limit logging
1 parent 46ce66c commit e9afd32

File tree

6 files changed

+119
-55
lines changed

6 files changed

+119
-55
lines changed

be/src/common/config.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,10 @@ CONF_Int64(max_sys_mem_available_low_water_mark_bytes, "1717986918");
7474
// The size of the memory that gc wants to release each time, as a percentage of the mem limit.
7575
CONF_mString(process_minor_gc_size, "10%");
7676
CONF_mString(process_full_gc_size, "20%");
77+
// Some caches have their own gc threads, such as segment cache.
78+
// For caches that do not have a separate gc thread, perform regular gc in the memory maintenance thread.
79+
// Currently only storage page cache, chunk allocator, more in the future.
80+
CONF_mInt32(cache_gc_interval_s, "60");
7781

7882
// If true, when the process does not exceed the soft mem limit, the query memory will not be limited;
7983
// when the process memory exceeds the soft mem limit, the query with the largest ratio between the currently
@@ -477,7 +481,7 @@ CONF_Bool(madvise_huge_pages, "false");
477481
CONF_Bool(mmap_buffers, "false");
478482

479483
// Sleep time in milliseconds between memory maintenance iterations
480-
CONF_mInt64(memory_maintenance_sleep_time_ms, "500");
484+
CONF_mInt32(memory_maintenance_sleep_time_ms, "500");
481485

482486
// Sleep time in milliseconds between load channel memory refresh iterations
483487
CONF_mInt64(load_channel_memory_refresh_sleep_time_ms, "100");

be/src/common/daemon.cpp

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,9 @@ void Daemon::tcmalloc_gc_thread() {
165165
}
166166

167167
void Daemon::memory_maintenance_thread() {
168-
int64_t interval_milliseconds = config::memory_maintenance_sleep_time_ms;
168+
int32_t interval_milliseconds = config::memory_maintenance_sleep_time_ms;
169+
int32_t cache_gc_interval_ms = config::cache_gc_interval_s * 1000;
170+
int64_t cache_gc_freed_mem = 0;
169171
while (!_stop_background_threads_latch.wait_for(
170172
std::chrono::milliseconds(interval_milliseconds))) {
171173
if (!MemInfo::initialized()) {
@@ -180,29 +182,49 @@ void Daemon::memory_maintenance_thread() {
180182
doris::MemInfo::refresh_allocator_mem();
181183
#endif
182184
doris::MemInfo::refresh_proc_mem_no_allocator_cache();
183-
LOG_EVERY_N(INFO, 10) << MemTrackerLimiter::process_mem_log_str();
184185

185186
// Refresh mem tracker each type metrics.
186187
doris::MemTrackerLimiter::refresh_global_counter();
187-
if (doris::config::memory_debug) {
188-
doris::MemTrackerLimiter::print_log_process_usage("memory_debug", false);
189-
}
190-
doris::MemTrackerLimiter::enable_print_log_process_usage();
191188

192189
// If system available memory is not enough, or the process memory exceeds the limit, reduce refresh interval.
193190
if (doris::MemInfo::sys_mem_available() <
194191
doris::MemInfo::sys_mem_available_low_water_mark() ||
195192
doris::MemInfo::proc_mem_no_allocator_cache() >= doris::MemInfo::mem_limit()) {
196-
interval_milliseconds = 100;
197-
doris::MemInfo::process_full_gc();
193+
doris::MemTrackerLimiter::print_log_process_usage("process full gc", false);
194+
interval_milliseconds = std::min(100, config::memory_maintenance_sleep_time_ms);
195+
if (doris::MemInfo::process_full_gc()) {
196+
// If there is not enough memory to be gc, the process memory usage will not be printed in the next continuous gc.
197+
doris::MemTrackerLimiter::enable_print_log_process_usage();
198+
}
199+
cache_gc_interval_ms = config::cache_gc_interval_s * 1000;
198200
} else if (doris::MemInfo::sys_mem_available() <
199201
doris::MemInfo::sys_mem_available_warning_water_mark() ||
200202
doris::MemInfo::proc_mem_no_allocator_cache() >=
201203
doris::MemInfo::soft_mem_limit()) {
202-
interval_milliseconds = 200;
203-
doris::MemInfo::process_minor_gc();
204+
doris::MemTrackerLimiter::print_log_process_usage("process minor gc", false);
205+
interval_milliseconds = std::min(200, config::memory_maintenance_sleep_time_ms);
206+
if (doris::MemInfo::process_minor_gc()) {
207+
doris::MemTrackerLimiter::enable_print_log_process_usage();
208+
}
209+
cache_gc_interval_ms = config::cache_gc_interval_s * 1000;
204210
} else {
211+
doris::MemTrackerLimiter::enable_print_log_process_usage();
205212
interval_milliseconds = config::memory_maintenance_sleep_time_ms;
213+
if (doris::config::memory_debug) {
214+
LOG_EVERY_N(WARNING, 20) << doris::MemTrackerLimiter::log_process_usage_str(
215+
"memory debug", false); // default 10s print once
216+
} else {
217+
LOG_EVERY_N(INFO, 10)
218+
<< MemTrackerLimiter::process_mem_log_str(); // default 5s print once
219+
}
220+
cache_gc_interval_ms -= interval_milliseconds;
221+
if (cache_gc_interval_ms < 0) {
222+
cache_gc_freed_mem = 0;
223+
doris::MemInfo::process_cache_gc(cache_gc_freed_mem);
224+
LOG(INFO) << fmt::format("Process regular GC Cache, Free Memory {} Bytes",
225+
cache_gc_freed_mem); // default 6s print once
226+
cache_gc_interval_ms = config::cache_gc_interval_s * 1000;
227+
}
206228
}
207229
}
208230
}

be/src/runtime/memory/mem_tracker_limiter.cpp

Lines changed: 36 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include <queue>
2424

2525
#include "runtime/fragment_mgr.h"
26+
#include "runtime/load_channel_mgr.h"
2627
#include "runtime/runtime_state.h"
2728
#include "runtime/thread_context.h"
2829
#include "util/pretty_printer.h"
@@ -195,26 +196,35 @@ void MemTrackerLimiter::print_log_usage(const std::string& msg) {
195196
}
196197
}
197198

199+
std::string MemTrackerLimiter::log_process_usage_str(const std::string& msg, bool with_stacktrace) {
200+
std::string detail = msg;
201+
detail += "\nProcess Memory Summary:\n " + MemTrackerLimiter::process_mem_log_str();
202+
if (with_stacktrace) detail += "\nAlloc Stacktrace:\n" + get_stack_trace();
203+
std::vector<MemTracker::Snapshot> snapshots;
204+
MemTrackerLimiter::make_process_snapshots(&snapshots);
205+
MemTrackerLimiter::make_type_snapshots(&snapshots, MemTrackerLimiter::Type::GLOBAL);
206+
207+
// Add additional tracker printed when memory exceeds limit.
208+
snapshots.emplace_back(
209+
ExecEnv::GetInstance()->load_channel_mgr()->mem_tracker()->make_snapshot());
210+
211+
detail += "\nMemory Tracker Summary:";
212+
for (const auto& snapshot : snapshots) {
213+
if (snapshot.label == "" && snapshot.parent_label == "") {
214+
detail += "\n " + MemTrackerLimiter::type_log_usage(snapshot);
215+
} else if (snapshot.parent_label == "") {
216+
detail += "\n " + MemTrackerLimiter::log_usage(snapshot);
217+
} else {
218+
detail += "\n " + MemTracker::log_usage(snapshot);
219+
}
220+
}
221+
return detail;
222+
}
223+
198224
void MemTrackerLimiter::print_log_process_usage(const std::string& msg, bool with_stacktrace) {
199225
if (MemTrackerLimiter::_enable_print_log_process_usage) {
200226
MemTrackerLimiter::_enable_print_log_process_usage = false;
201-
std::string detail = msg;
202-
detail += "\nProcess Memory Summary:\n " + MemTrackerLimiter::process_mem_log_str();
203-
if (with_stacktrace) detail += "\nAlloc Stacktrace:\n" + get_stack_trace();
204-
std::vector<MemTracker::Snapshot> snapshots;
205-
MemTrackerLimiter::make_process_snapshots(&snapshots);
206-
MemTrackerLimiter::make_type_snapshots(&snapshots, MemTrackerLimiter::Type::GLOBAL);
207-
detail += "\nMemory Tracker Summary:";
208-
for (const auto& snapshot : snapshots) {
209-
if (snapshot.label == "" && snapshot.parent_label == "") {
210-
detail += "\n " + MemTrackerLimiter::type_log_usage(snapshot);
211-
} else if (snapshot.parent_label == "") {
212-
detail += "\n " + MemTrackerLimiter::log_usage(snapshot);
213-
} else {
214-
detail += "\n " + MemTracker::log_usage(snapshot);
215-
}
216-
}
217-
LOG(WARNING) << detail;
227+
LOG(WARNING) << log_process_usage_str(msg, with_stacktrace);
218228
}
219229
}
220230

@@ -252,6 +262,10 @@ int64_t MemTrackerLimiter::free_top_memory_query(int64_t min_free_mem, Type type
252262
int64_t freed_mem = 0;
253263
while (!min_pq.empty()) {
254264
TUniqueId cancelled_queryid = label_to_queryid(min_pq.top().second);
265+
if (cancelled_queryid == TUniqueId()) {
266+
min_pq.pop();
267+
continue;
268+
}
255269
ExecEnv::GetInstance()->fragment_mgr()->cancel_query(
256270
cancelled_queryid, PPlanFragmentCancelReason::MEMORY_LIMIT_EXCEED,
257271
fmt::format("Process has no memory available, cancel top memory usage {}: "
@@ -344,6 +358,10 @@ int64_t MemTrackerLimiter::free_top_overcommit_query(int64_t min_free_mem, Type
344358
int64_t freed_mem = 0;
345359
while (!max_pq.empty()) {
346360
TUniqueId cancelled_queryid = label_to_queryid(max_pq.top().second);
361+
if (cancelled_queryid == TUniqueId()) {
362+
max_pq.pop();
363+
continue;
364+
}
347365
int64_t query_mem = query_consumption[max_pq.top().second];
348366
ExecEnv::GetInstance()->fragment_mgr()->cancel_query(
349367
cancelled_queryid, PPlanFragmentCancelReason::MEMORY_LIMIT_EXCEED,
@@ -354,7 +372,7 @@ int64_t MemTrackerLimiter::free_top_overcommit_query(int64_t min_free_mem, Type
354372
"details see be.INFO.",
355373
TypeString[type], TypeString[type], max_pq.top().second,
356374
print_bytes(query_mem), BackendOptions::get_localhost(),
357-
PerfCounters::get_vm_rss_str(), print_bytes(MemInfo::soft_mem_limit()),
375+
PerfCounters::get_vm_rss_str(), MemInfo::soft_mem_limit_str(),
358376
MemInfo::sys_mem_available_str(),
359377
print_bytes(MemInfo::sys_mem_available_warning_water_mark())));
360378

be/src/runtime/memory/mem_tracker_limiter.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ class MemTrackerLimiter final : public MemTracker {
136136
void print_log_usage(const std::string& msg);
137137
void enable_print_log_usage() { _enable_print_log_usage = true; }
138138
static void enable_print_log_process_usage() { _enable_print_log_process_usage = true; }
139+
static std::string log_process_usage_str(const std::string& msg, bool with_stacktrace = true);
139140
static void print_log_process_usage(const std::string& msg, bool with_stacktrace = true);
140141

141142
// Log the memory usage when memory limit is exceeded.
@@ -157,6 +158,9 @@ class MemTrackerLimiter final : public MemTracker {
157158
}
158159
// only for Type::QUERY or Type::LOAD.
159160
static TUniqueId label_to_queryid(const std::string& label) {
161+
if (label.rfind("Query#Id=", 0) != 0 && label.rfind("Load#Id=", 0) != 0) {
162+
return TUniqueId();
163+
}
160164
auto queryid = split(label, "#Id=")[1];
161165
TUniqueId querytid;
162166
parse_id(queryid, &querytid);
@@ -165,12 +169,14 @@ class MemTrackerLimiter final : public MemTracker {
165169

166170
static std::string process_mem_log_str() {
167171
return fmt::format(
168-
"OS physical memory {}, process memory used {} limit {}, sys mem available {} low "
169-
"water mark {}, refresh interval memory growth {} B",
172+
"OS physical memory {}. Process memory usage {}, limit {}, soft limit {}. Sys "
173+
"available memory {}, low water mark {}, warning water mark {}. Refresh interval "
174+
"memory growth {} B",
170175
PrettyPrinter::print(MemInfo::physical_mem(), TUnit::BYTES),
171176
PerfCounters::get_vm_rss_str(), MemInfo::mem_limit_str(),
172-
MemInfo::sys_mem_available_str(),
177+
MemInfo::soft_mem_limit_str(), MemInfo::sys_mem_available_str(),
173178
PrettyPrinter::print(MemInfo::sys_mem_available_low_water_mark(), TUnit::BYTES),
179+
PrettyPrinter::print(MemInfo::sys_mem_available_warning_water_mark(), TUnit::BYTES),
174180
MemInfo::refresh_interval_memory_growth);
175181
}
176182

be/src/util/mem_info.cpp

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ int64_t MemInfo::_s_physical_mem = -1;
4747
int64_t MemInfo::_s_mem_limit = -1;
4848
std::string MemInfo::_s_mem_limit_str = "";
4949
int64_t MemInfo::_s_soft_mem_limit = -1;
50+
std::string MemInfo::_s_soft_mem_limit_str = "";
5051

5152
int64_t MemInfo::_s_allocator_cache_mem = 0;
5253
std::string MemInfo::_s_allocator_cache_mem_str = "";
@@ -89,61 +90,66 @@ void MemInfo::refresh_allocator_mem() {
8990
#endif
9091
}
9192

93+
void MemInfo::process_cache_gc(int64_t& freed_mem) {
94+
// TODO, free more cache, and should free a certain percentage of capacity, not all.
95+
freed_mem += ChunkAllocator::instance()->mem_consumption();
96+
ChunkAllocator::instance()->clear();
97+
freed_mem +=
98+
StoragePageCache::instance()->get_page_cache_mem_consumption(segment_v2::DATA_PAGE);
99+
StoragePageCache::instance()->prune(segment_v2::DATA_PAGE);
100+
}
101+
92102
// step1: free all cache
93103
// step2: free top overcommit query, if enable query memroy overcommit
94-
void MemInfo::process_minor_gc() {
95-
// TODO, free more cache, and should free a certain percentage of capacity, not all.
104+
bool MemInfo::process_minor_gc() {
96105
int64_t freed_mem = 0;
97106
Defer defer {[&]() {
98107
LOG(INFO) << fmt::format("Process Minor GC Free Memory {} Bytes", freed_mem);
99108
}};
100109

101-
freed_mem += ChunkAllocator::instance()->mem_consumption();
102-
ChunkAllocator::instance()->clear();
110+
MemInfo::process_cache_gc(freed_mem);
103111
if (freed_mem > _s_process_minor_gc_size) {
104-
return;
112+
return true;
105113
}
106-
freed_mem +=
107-
StoragePageCache::instance()->get_page_cache_mem_consumption(segment_v2::DATA_PAGE);
108-
StoragePageCache::instance()->prune(segment_v2::DATA_PAGE);
109114
if (config::enable_query_memroy_overcommit) {
110115
freed_mem +=
111116
MemTrackerLimiter::free_top_overcommit_query(_s_process_minor_gc_size - freed_mem);
112117
}
118+
if (freed_mem > _s_process_minor_gc_size) {
119+
return true;
120+
}
121+
return false;
113122
}
114123

115124
// step1: free all cache
116125
// step2: free top memory query
117126
// step3: free top overcommit load, load retries are more expensive, So cancel at the end.
118127
// step4: free top memory load
119-
void MemInfo::process_full_gc() {
128+
bool MemInfo::process_full_gc() {
120129
int64_t freed_mem = 0;
121130
Defer defer {
122131
[&]() { LOG(INFO) << fmt::format("Process Full GC Free Memory {} Bytes", freed_mem); }};
123132

124-
freed_mem +=
125-
StoragePageCache::instance()->get_page_cache_mem_consumption(segment_v2::DATA_PAGE);
126-
StoragePageCache::instance()->prune(segment_v2::DATA_PAGE);
133+
MemInfo::process_cache_gc(freed_mem);
127134
if (freed_mem > _s_process_full_gc_size) {
128-
return;
129-
}
130-
freed_mem += ChunkAllocator::instance()->mem_consumption();
131-
ChunkAllocator::instance()->clear();
132-
if (freed_mem > _s_process_full_gc_size) {
133-
return;
135+
return true;
134136
}
135137
freed_mem += MemTrackerLimiter::free_top_memory_query(_s_process_full_gc_size - freed_mem);
136138
if (freed_mem > _s_process_full_gc_size) {
137-
return;
139+
return true;
138140
}
139141
if (config::enable_query_memroy_overcommit) {
140142
freed_mem +=
141143
MemTrackerLimiter::free_top_overcommit_load(_s_process_full_gc_size - freed_mem);
142144
if (freed_mem > _s_process_full_gc_size) {
143-
return;
145+
return true;
144146
}
145147
}
146148
freed_mem += MemTrackerLimiter::free_top_memory_load(_s_process_full_gc_size - freed_mem);
149+
if (freed_mem > _s_process_full_gc_size) {
150+
return true;
151+
}
152+
return false;
147153
}
148154

149155
#ifndef __APPLE__
@@ -203,6 +209,7 @@ void MemInfo::init() {
203209
}
204210
_s_mem_limit_str = PrettyPrinter::print(_s_mem_limit, TUnit::BYTES);
205211
_s_soft_mem_limit = _s_mem_limit * config::soft_mem_limit_frac;
212+
_s_soft_mem_limit_str = PrettyPrinter::print(_s_soft_mem_limit, TUnit::BYTES);
206213

207214
_s_process_minor_gc_size =
208215
ParseUtil::parse_mem_spec(config::process_minor_gc_size, -1, _s_mem_limit, &is_percent);
@@ -239,7 +246,7 @@ void MemInfo::init() {
239246
config::max_sys_mem_available_low_water_mark_bytes);
240247
int64_t p2 = std::max<int64_t>(_s_vm_min_free_kbytes - _s_physical_mem * 0.01, 0);
241248
_s_sys_mem_available_low_water_mark = std::max<int64_t>(p1 - p2, 0);
242-
_s_sys_mem_available_warning_water_mark = _s_sys_mem_available_low_water_mark + p1 * 2;
249+
_s_sys_mem_available_warning_water_mark = _s_sys_mem_available_low_water_mark + p1;
243250

244251
LOG(INFO) << "Physical Memory: " << PrettyPrinter::print(_s_physical_mem, TUnit::BYTES)
245252
<< ", Mem Limit: " << _s_mem_limit_str
@@ -264,6 +271,7 @@ void MemInfo::init() {
264271
_s_mem_limit = ParseUtil::parse_mem_spec(config::mem_limit, -1, _s_physical_mem, &is_percent);
265272
_s_mem_limit_str = PrettyPrinter::print(_s_mem_limit, TUnit::BYTES);
266273
_s_soft_mem_limit = _s_mem_limit * config::soft_mem_limit_frac;
274+
_s_soft_mem_limit_str = PrettyPrinter::print(_s_soft_mem_limit, TUnit::BYTES);
267275

268276
LOG(INFO) << "Physical Memory: " << PrettyPrinter::print(_s_physical_mem, TUnit::BYTES);
269277
_s_initialized = true;

be/src/util/mem_info.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,11 +111,16 @@ class MemInfo {
111111
DCHECK(_s_initialized);
112112
return _s_soft_mem_limit;
113113
}
114+
static inline std::string soft_mem_limit_str() {
115+
DCHECK(_s_initialized);
116+
return _s_soft_mem_limit_str;
117+
}
114118

115119
static std::string debug_string();
116120

117-
static void process_minor_gc();
118-
static void process_full_gc();
121+
static void process_cache_gc(int64_t& freed_mem);
122+
static bool process_minor_gc();
123+
static bool process_full_gc();
119124

120125
// It is only used after the memory limit is exceeded. When multiple threads are waiting for the available memory of the process,
121126
// avoid multiple threads starting at the same time and causing OOM.
@@ -127,6 +132,7 @@ class MemInfo {
127132
static int64_t _s_mem_limit;
128133
static std::string _s_mem_limit_str;
129134
static int64_t _s_soft_mem_limit;
135+
static std::string _s_soft_mem_limit_str;
130136

131137
static int64_t _s_allocator_cache_mem;
132138
static std::string _s_allocator_cache_mem_str;

0 commit comments

Comments
 (0)