Skip to content

Commit 2c7efc0

Browse files
committed
8381834: Improve TLAB sizing heuristics
Reviewed-by: jsikstro, tschatzl, aboldtch
1 parent: 7df417c · commit: 2c7efc0

3 files changed

Lines changed: 166 additions & 112 deletions

File tree

src/hotspot/share/gc/shared/threadLocalAllocBuffer.cpp

Lines changed: 124 additions & 98 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@
3535
#include "runtime/perfData.hpp"
3636
#include "runtime/threadSMR.hpp"
3737
#include "utilities/copy.hpp"
38+
#include "utilities/integerCast.hpp"
3839

3940
size_t ThreadLocalAllocBuffer::_max_size = 0;
4041
unsigned int ThreadLocalAllocBuffer::_target_num_refills = 0;
@@ -75,43 +76,54 @@ size_t ThreadLocalAllocBuffer::remaining() {
7576
}
7677

7778
void ThreadLocalAllocBuffer::accumulate_and_reset_statistics(ThreadLocalAllocStats* stats) {
78-
size_t capacity = Universe::heap()->tlab_capacity();
79-
size_t used = Universe::heap()->tlab_used();
80-
8179
_gc_waste += (unsigned)remaining();
82-
uint64_t total_allocated = thread()->allocated_bytes();
83-
uint64_t allocated_since_last_gc = total_allocated - _allocated_before_last_gc;
84-
_allocated_before_last_gc = total_allocated;
85-
86-
print_stats("gc");
87-
88-
if (_num_refills > 0) {
89-
// Update allocation history if a reasonable amount of eden was allocated.
90-
bool update_allocation_history = used > 0.5 * capacity;
91-
92-
if (update_allocation_history) {
93-
// Average the fraction of eden allocated in a tlab by this
94-
// thread for use in the next resize operation.
95-
// _gc_waste is not subtracted because it's included in
96-
// "used".
97-
// The result can be larger than 1.0 due to direct to old allocations.
98-
// These allocations should ideally not be counted but since it is not possible
99-
// to filter them out here we just cap the fraction to be at most 1.0.
100-
// Keep alloc_frac as float and not double to avoid the double to float conversion
101-
float alloc_frac = MIN2(1.0f, allocated_since_last_gc / (float) used);
102-
_allocation_fraction.sample(alloc_frac);
80+
const uint64_t allocated_bytes = thread()->allocated_bytes();
81+
82+
const size_t allocated_since_last_gc = integer_cast_permit_tautology<size_t>(allocated_bytes - _allocated_before_last_gc);
83+
_allocated_before_last_gc = allocated_bytes;
84+
85+
if (allocated_since_last_gc > 0) {
86+
const size_t tlab_capacity = Universe::heap()->tlab_capacity();
87+
const size_t tlab_used = Universe::heap()->tlab_used();
88+
if (tlab_used > 0.5 * tlab_capacity) {
89+
// To avoid divide-by-zero
90+
const size_t effective_tlab_capacity = MAX2(tlab_capacity, size_t(1));
91+
const float alloc_frac = (float)allocated_since_last_gc / effective_tlab_capacity;
92+
_allocation_fraction.sample(MIN2(alloc_frac, 1.0f));
10393
}
104-
105-
stats->update_fast_allocations(_num_refills,
106-
_allocated_size,
107-
_gc_waste,
108-
_refill_waste);
94+
stats->update_current_thread_stats(_num_refills,
95+
allocated_since_last_gc,
96+
_allocated_size,
97+
_gc_waste,
98+
_refill_waste,
99+
_num_slow_allocations);
109100
} else {
110-
assert(_num_refills == 0 && _refill_waste == 0 && _gc_waste == 0,
101+
assert(_num_refills == 0 && _refill_waste == 0
102+
&& _gc_waste == 0 && _num_slow_allocations == 0,
111103
"tlab stats == 0");
112104
}
113105

114-
stats->update_num_slow_allocations(_num_slow_allocations);
106+
{
107+
Log(gc, tlab) log;
108+
if (log.is_trace()) {
109+
Thread* thrd = thread();
110+
size_t waste = _gc_waste + _refill_waste;
111+
double waste_percent = percent_of(waste, _allocated_size);
112+
log.trace("TLAB GC: thread: " PTR_FORMAT " [id: %2d]"
113+
" desired: %zuK"
114+
" allocated: %zuK"
115+
" slow allocs: %d refill waste: %zuB"
116+
" refills: %d waste %4.1f%% gc: %dB"
117+
" slow: %dB",
118+
p2i(thrd), thrd->osthread()->thread_id(),
119+
_desired_size*HeapWordSize/K,
120+
allocated_since_last_gc/K,
121+
_num_slow_allocations, _refill_waste_limit * HeapWordSize,
122+
_num_refills, waste_percent,
123+
_gc_waste * HeapWordSize,
124+
_refill_waste * HeapWordSize);
125+
}
126+
}
115127

116128
reset_statistics();
117129
}
@@ -147,20 +159,27 @@ void ThreadLocalAllocBuffer::record_refill_waste() {
147159
}
148160

149161
void ThreadLocalAllocBuffer::resize() {
150-
// Compute the next tlab size using expected allocation amount
151162
assert(ResizeTLAB, "Should not call this otherwise");
152-
size_t alloc = (size_t)(_allocation_fraction.average() *
153-
(Universe::heap()->tlab_capacity() / HeapWordSize));
163+
size_t capacity_in_words = Universe::heap()->tlab_capacity() / HeapWordSize;
164+
float alloc_fraction = _allocation_fraction.average();
165+
if (alloc_fraction == 0.0) {
166+
// No samples, use global alloc fraction as an approximation.
167+
const float total_frac = ThreadLocalAllocStats::total_requested_size_fraction_avg();
168+
const uint num_threads = ThreadLocalAllocStats::num_allocating_threads_avg();
169+
alloc_fraction = total_frac / num_threads;
170+
}
171+
size_t alloc = (size_t)(alloc_fraction * capacity_in_words);
154172
size_t new_size = alloc / _target_num_refills;
155173

156174
new_size = clamp(new_size, min_size(), max_size());
157175

158176
size_t aligned_new_size = align_object_size(new_size);
159177

160-
log_trace(gc, tlab)("TLAB new size: thread: " PTR_FORMAT " [id: %2d]"
161-
" refills %d alloc: %8.6f desired_size: %zu -> %zu",
178+
log_trace(gc, tlab)("TLAB resize: thread: " PTR_FORMAT " [id: %2d]"
179+
" alloc-fraction: %.3f desired_size: %zuK -> %zuK",
162180
p2i(thread()), thread()->osthread()->thread_id(),
163-
_target_num_refills, _allocation_fraction.average(), desired_size(), aligned_new_size);
181+
alloc_fraction,
182+
desired_size() * HeapWordSize/K, aligned_new_size * HeapWordSize/K);
164183

165184
set_desired_size(aligned_new_size);
166185
set_refill_waste_limit(initial_refill_waste_limit());
@@ -179,11 +198,24 @@ void ThreadLocalAllocBuffer::fill(HeapWord* start,
179198
size_t new_size) {
180199
_num_refills++;
181200
_allocated_size += new_size;
182-
print_stats("fill");
201+
183202
assert(top <= start + new_size - alignment_reserve(), "size too small");
184203

185204
initialize(start, top, start + new_size - alignment_reserve());
186-
205+
{
206+
Log(gc, tlab) log;
207+
if (log.is_trace()) {
208+
Thread* thrd = thread();
209+
log.trace("TLAB fill: thread: " PTR_FORMAT " [id: %2d]"
210+
" capacity: %zuK"
211+
" slow allocs: %d "
212+
" refills: %d",
213+
p2i(thrd), thrd->osthread()->thread_id(),
214+
pointer_delta(_end, _start, sizeof(char)) / K,
215+
_num_slow_allocations,
216+
_num_refills);
217+
}
218+
}
187219
// Reset amount of internal fragmentation
188220
set_refill_waste_limit(initial_refill_waste_limit());
189221
}
@@ -206,13 +238,6 @@ void ThreadLocalAllocBuffer::initialize() {
206238

207239
set_desired_size(initial_desired_size());
208240

209-
size_t capacity = Universe::heap()->tlab_capacity() / HeapWordSize;
210-
if (capacity > 0) {
211-
// Keep alloc_frac as float and not double to avoid the double to float conversion
212-
float alloc_frac = desired_size() * target_num_refills() / (float)capacity;
213-
_allocation_fraction.sample(alloc_frac);
214-
}
215-
216241
set_refill_waste_limit(initial_refill_waste_limit());
217242

218243
reset_statistics();
@@ -243,11 +268,11 @@ size_t ThreadLocalAllocBuffer::initial_desired_size() {
243268
if (TLABSize > 0) {
244269
init_sz = TLABSize / HeapWordSize;
245270
} else {
246-
// Initial size is a function of the average number of allocating threads.
247-
unsigned int num_threads = ThreadLocalAllocStats::num_allocating_threads_avg();
248-
249-
init_sz = (Universe::heap()->tlab_capacity() / HeapWordSize) /
250-
(num_threads * target_num_refills());
271+
const size_t predicted_total_requested_size = (size_t)(ThreadLocalAllocStats::total_requested_size_fraction_avg() * Universe::heap()->tlab_capacity());
272+
const uint num_threads = ThreadLocalAllocStats::num_allocating_threads_avg();
273+
const size_t per_thread_requested_size = predicted_total_requested_size / num_threads;
274+
const size_t tlab_size = per_thread_requested_size / _target_num_refills;
275+
init_sz = tlab_size / HeapWordSize;
251276
init_sz = align_object_size(init_sz);
252277
}
253278
// We can't use clamp() between min_size() and max_size() here because some
@@ -258,32 +283,7 @@ size_t ThreadLocalAllocBuffer::initial_desired_size() {
258283
return init_sz;
259284
}
260285

261-
void ThreadLocalAllocBuffer::print_stats(const char* tag) {
262-
Log(gc, tlab) log;
263-
if (!log.is_trace()) {
264-
return;
265-
}
266-
267-
Thread* thrd = thread();
268-
size_t waste = _gc_waste + _refill_waste;
269-
double waste_percent = percent_of(waste, _allocated_size);
270-
size_t tlab_used = Universe::heap()->tlab_used();
271-
log.trace("TLAB: %s thread: " PTR_FORMAT " [id: %2d]"
272-
" desired_size: %zuKB"
273-
" slow allocs: %d refill waste: %zuB"
274-
" alloc:%8.5f %8.0fKB refills: %d waste %4.1f%% gc: %dB"
275-
" slow: %dB",
276-
tag, p2i(thrd), thrd->osthread()->thread_id(),
277-
_desired_size / (K / HeapWordSize),
278-
_num_slow_allocations, _refill_waste_limit * HeapWordSize,
279-
_allocation_fraction.average(),
280-
_allocation_fraction.average() * tlab_used / K,
281-
_num_refills, waste_percent,
282-
_gc_waste * HeapWordSize,
283-
_refill_waste * HeapWordSize);
284-
}
285-
286-
Thread* ThreadLocalAllocBuffer::thread() {
286+
Thread* ThreadLocalAllocBuffer::thread() const {
287287
return (Thread*)(((char*)this) + in_bytes(start_offset()) - in_bytes(Thread::tlab_start_offset()));
288288
}
289289

@@ -314,6 +314,7 @@ PerfVariable* ThreadLocalAllocStats::_perf_max_refill_waste;
314314
PerfVariable* ThreadLocalAllocStats::_perf_total_num_slow_allocations;
315315
PerfVariable* ThreadLocalAllocStats::_perf_max_num_slow_allocations;
316316
AdaptiveWeightedAverage ThreadLocalAllocStats::_num_allocating_threads_avg(0);
317+
AdaptiveWeightedAverage ThreadLocalAllocStats::_total_requested_size_fraction(0);
317318

318319
static PerfVariable* create_perf_variable(const char* name, PerfData::Units unit, TRAPS) {
319320
ResourceMark rm;
@@ -324,6 +325,9 @@ void ThreadLocalAllocStats::initialize() {
324325
_num_allocating_threads_avg = AdaptiveWeightedAverage(TLABAllocationWeight);
325326
_num_allocating_threads_avg.sample(1); // One allocating thread at startup
326327

328+
_total_requested_size_fraction = AdaptiveWeightedAverage(TLABAllocationWeight);
329+
_total_requested_size_fraction.sample(0.10f); // 10%
330+
327331
if (UsePerfData) {
328332
EXCEPTION_MARK;
329333
_perf_num_allocating_threads = create_perf_variable("allocThreads", PerfData::U_None, CHECK);
@@ -344,6 +348,7 @@ ThreadLocalAllocStats::ThreadLocalAllocStats() :
344348
_total_num_refills(0),
345349
_max_num_refills(0),
346350
_total_allocated_size(0),
351+
_total_requested_bytes(0),
347352
_total_gc_waste(0),
348353
_max_gc_waste(0),
349354
_total_refill_waste(0),
@@ -355,21 +360,25 @@ unsigned int ThreadLocalAllocStats::num_allocating_threads_avg() {
355360
return MAX2((unsigned int)(_num_allocating_threads_avg.average() + 0.5), 1U);
356361
}
357362

358-
void ThreadLocalAllocStats::update_fast_allocations(unsigned int num_refills,
359-
size_t allocated_size,
360-
size_t gc_waste,
361-
size_t refill_waste) {
362-
_num_allocating_threads += 1;
363-
_total_num_refills += num_refills;
364-
_max_num_refills = MAX2(_max_num_refills, num_refills);
365-
_total_allocated_size += allocated_size;
366-
_total_gc_waste += gc_waste;
367-
_max_gc_waste = MAX2(_max_gc_waste, gc_waste);
368-
_total_refill_waste += refill_waste;
369-
_max_refill_waste = MAX2(_max_refill_waste, refill_waste);
363+
float ThreadLocalAllocStats::total_requested_size_fraction_avg() {
364+
return _total_requested_size_fraction.average();
370365
}
371366

372-
void ThreadLocalAllocStats::update_num_slow_allocations(unsigned int num_slow_allocations) {
367+
void ThreadLocalAllocStats::update_current_thread_stats(unsigned int num_refills,
368+
size_t requested_bytes,
369+
size_t alloc_size_for_tlab,
370+
size_t gc_waste,
371+
size_t refill_waste,
372+
unsigned int num_slow_allocations) {
373+
_num_allocating_threads += 1;
374+
_total_num_refills += num_refills;
375+
_max_num_refills = MAX2(_max_num_refills, num_refills);
376+
_total_allocated_size += alloc_size_for_tlab;
377+
_total_requested_bytes += requested_bytes;
378+
_total_gc_waste += gc_waste;
379+
_max_gc_waste = MAX2(_max_gc_waste, gc_waste);
380+
_total_refill_waste += refill_waste;
381+
_max_refill_waste = MAX2(_max_refill_waste, refill_waste);
373382
_total_num_slow_allocations += num_slow_allocations;
374383
_max_num_slow_allocations = MAX2(_max_num_slow_allocations, num_slow_allocations);
375384
}
@@ -379,6 +388,7 @@ void ThreadLocalAllocStats::update(const ThreadLocalAllocStats& other) {
379388
_total_num_refills += other._total_num_refills;
380389
_max_num_refills = MAX2(_max_num_refills, other._max_num_refills);
381390
_total_allocated_size += other._total_allocated_size;
391+
_total_requested_bytes += other._total_requested_bytes;
382392
_total_gc_waste += other._total_gc_waste;
383393
_max_gc_waste = MAX2(_max_gc_waste, other._max_gc_waste);
384394
_total_refill_waste += other._total_refill_waste;
@@ -392,6 +402,7 @@ void ThreadLocalAllocStats::reset() {
392402
_total_num_refills = 0;
393403
_max_num_refills = 0;
394404
_total_allocated_size = 0;
405+
_total_requested_bytes = 0;
395406
_total_gc_waste = 0;
396407
_max_gc_waste = 0;
397408
_total_refill_waste = 0;
@@ -401,22 +412,37 @@ void ThreadLocalAllocStats::reset() {
401412
}
402413

403414
void ThreadLocalAllocStats::publish() {
404-
if (_total_allocated_size == 0) {
415+
if (_total_requested_bytes == 0) {
405416
return;
406417
}
407418

408419
_num_allocating_threads_avg.sample(_num_allocating_threads);
409420

421+
{
422+
const size_t tlab_capacity = Universe::heap()->tlab_capacity();
423+
const size_t tlab_used = Universe::heap()->tlab_used();
424+
if (tlab_used > 0.5 * tlab_capacity) {
425+
// To avoid divide-by-zero
426+
const size_t effective_tlab_capacity = MAX2(tlab_capacity, size_t(1));
427+
const float requested_size_fraction = (float)_total_requested_bytes / effective_tlab_capacity;
428+
_total_requested_size_fraction.sample(MIN2(requested_size_fraction, 1.0f));
429+
}
430+
}
431+
410432
const size_t waste = _total_gc_waste + _total_refill_waste;
411433
const double waste_percent = percent_of(waste, _total_allocated_size);
412-
log_debug(gc, tlab)("TLAB totals: thrds: %d refills: %d max: %d"
413-
" slow allocs: %d max %d waste: %4.1f%%"
414-
" gc: %zuB max: %zuB"
415-
" slow: %zuB max: %zuB",
416-
_num_allocating_threads, _total_num_refills, _max_num_refills,
434+
435+
const double gc_waste_pct = percent_of(_total_gc_waste, _total_allocated_size);
436+
const double refill_waste_pct = percent_of(_total_refill_waste, _total_allocated_size);
437+
438+
log_debug(gc, tlab)("TLAB totals: thrds: %d alloc-frac: %.1f%% refills: %d max: %d"
439+
" slow allocs: %d max %d waste: %.1f%%"
440+
" gc: %zuB(%.1f%%) max: %zuB"
441+
" refill: %zuB(%.1f%%) max: %zuB",
442+
_num_allocating_threads, _total_requested_size_fraction.average() * 100, _total_num_refills, _max_num_refills,
417443
_total_num_slow_allocations, _max_num_slow_allocations, waste_percent,
418-
_total_gc_waste * HeapWordSize, _max_gc_waste * HeapWordSize,
419-
_total_refill_waste * HeapWordSize, _max_refill_waste * HeapWordSize);
444+
_total_gc_waste * HeapWordSize, gc_waste_pct, _max_gc_waste * HeapWordSize,
445+
_total_refill_waste * HeapWordSize, refill_waste_pct, _max_refill_waste * HeapWordSize);
420446

421447
if (UsePerfData) {
422448
_perf_num_allocating_threads ->set_value(_num_allocating_threads);

0 commit comments

Comments
 (0)