Skip to content

Commit 6277a20

Browse files
committed
feat: enable memory profiling
1 parent 0324be0 commit 6277a20

11 files changed

Lines changed: 598 additions & 6 deletions

File tree

CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ set(mi_sources
7272
src/options.c
7373
src/os.c
7474
src/page.c
75+
src/profile.c
7576
src/random.c
7677
src/segment.c
7778
src/segment-map.c
@@ -746,7 +747,7 @@ if (MI_BUILD_TESTS)
746747
enable_testing()
747748

748749
# static link tests
749-
foreach(TEST_NAME api api-fill stress)
750+
foreach(TEST_NAME api api-fill stress profile)
750751
add_executable(mimalloc-test-${TEST_NAME} test/test-${TEST_NAME}.c)
751752
target_compile_definitions(mimalloc-test-${TEST_NAME} PRIVATE ${mi_defines})
752753
target_compile_options(mimalloc-test-${TEST_NAME} PRIVATE ${mi_cflags})

include/mimalloc/internal.h

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1162,5 +1162,43 @@ static inline void _mi_memzero_aligned(void* dst, size_t n) {
11621162
}
11631163
#endif
11641164

1165+
// ------------------------------------------------------------------
1166+
// Heap profiler fast-path hooks (profile.c / profile.h)
1167+
//
1168+
// Inlined here so that the common case (profiling disabled or no
1169+
// sample due) adds only a handful of instructions at each call site.
1170+
// The slow paths are noinline and defined in profile.c.
1171+
//
1172+
// Thread safety: see profile.h for the acquire/release protocol on
1173+
// _mi_profiler.enabled.
1174+
// ------------------------------------------------------------------
1175+
#include "mimalloc/profile.h"
1176+
1177+
static inline void _mi_profiler_on_alloc(mi_heap_t* heap, mi_page_t* page, void* ptr, size_t size) {
1178+
// Relaxed load: we only need to know whether to do any work. The acquire
1179+
// that synchronizes on_alloc/on_free/record_extra_bytes visibility is in
1180+
// the slow path, which is the first place we actually read those fields.
1181+
if mi_likely(!mi_atomic_load_relaxed(&_mi_profiler.enabled)) return;
1182+
mi_profiler_tld_t* ptld = &heap->tld->profiler;
1183+
if (ptld->in_profiler) return; // reentrancy guard: skip profiling inside profiler code
1184+
ptld->bytes_since_sample += size; // size_t arithmetic: wraps modulo SIZE_MAX+1 (2^64 bytes, ~18 EB, on 64-bit targets), harmless in practice
1185+
if mi_likely(ptld->bytes_since_sample < ptld->next_threshold) return; // threshold not reached: no sample due
1186+
_mi_profiler_on_alloc_slow(heap, page, ptr, size);
1187+
}
1188+
1189+
static inline void _mi_profiler_on_free_local(mi_page_t* page, void* ptr) {
1190+
// No acquire load on enabled: has_metadata is only ever set by the owning thread
1191+
// after it observed enabled=true via the acquire load in the slow path of _mi_profiler_on_alloc.
1192+
// If has_metadata is true, all profiler fields are already visible to this thread.
1193+
if mi_likely(!page->has_metadata) return;
1194+
_mi_profiler_on_free_local_slow(page, ptr);
1195+
}
1196+
1197+
static inline void _mi_profiler_on_free_collected(mi_page_t* page, mi_block_t* head) {
1198+
// Same argument as _mi_profiler_on_free_local: checking has_metadata alone suffices, no load of enabled is needed.
1199+
if mi_likely(!page->has_metadata) return;
1200+
_mi_profiler_on_free_collected_slow(page, head);
1201+
}
1202+
11651203

11661204
#endif

include/mimalloc/profile.h

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
/* ----------------------------------------------------------------------------
2+
Copyright (c) 2018-2024, Microsoft Research, Daan Leijen
3+
This is free software; you can redistribute it and/or modify it under the
4+
terms of the MIT license. A copy of the license can be found in the file
5+
"LICENSE" at the root of this distribution.
6+
-----------------------------------------------------------------------------*/
7+
#pragma once
8+
#ifndef MIMALLOC_PROFILE_H
9+
#define MIMALLOC_PROFILE_H
10+
11+
#include <stddef.h>
12+
#include <stdint.h>
13+
#include <stdbool.h>
14+
#include "mimalloc/atomic.h"
15+
16+
// Forward declarations — full types come from types.h / internal.h.
17+
typedef struct mi_page_s mi_page_t;
18+
typedef struct mi_block_s mi_block_t;
19+
typedef struct mi_heap_s mi_heap_t;
20+
21+
// ------------------------------------------------------------------
22+
// Allocation record: one node per sampled live allocation, stored in
23+
// a singly-linked list at page->metadata. Opaque to callers; the
24+
// callbacks receive user_data directly.
25+
//
26+
// `ptr` — the sampled user pointer; used internally to match frees.
27+
// `user_data` — flexible array member for caller-owned metadata.
28+
// The number of bytes available is
29+
// _mi_profiler.record_extra_bytes, set at
30+
// mi_profiler_enable() time. Typical uses: a captured
31+
// stack trace, allocation size and weight for on_free,
32+
// a pointer to an external profiler node, or a timestamp.
33+
// The profiler does not initialize this region and never
34+
// reads it.
35+
// Alignment: user_data begins at offset sizeof(mi_alloc_record_t)
36+
// from the allocation base (2 * sizeof(void*): 16 bytes on 64-bit,
37+
// 8 bytes on 32-bit), so it is suitably aligned for any scalar or
38+
// pointer type. Types that need stricter alignment than that offset
39+
// (e.g. SIMD types requiring > 16-byte alignment on 64-bit) are not guaranteed to be aligned.
40+
// ------------------------------------------------------------------
41+
typedef struct mi_alloc_record_s {
42+
void* ptr; // the sampled user pointer; used internally to match frees
43+
struct mi_alloc_record_s* next; // next record in the singly-linked list headed at page->metadata
44+
char user_data[]; // length = _mi_profiler.record_extra_bytes
45+
} mi_alloc_record_t;
46+
47+
// ------------------------------------------------------------------
48+
// User-supplied callbacks.
49+
//
50+
// on_alloc: called when a sample is taken.
51+
// `user_data` — caller-owned region (record_extra_bytes bytes);
52+
// may write anything here for use in on_free.
53+
// NULL if record_extra_bytes is 0.
54+
// `ptr` — the sampled user pointer.
55+
// `requested_size` — size passed by the caller to malloc/calloc/etc.
56+
// `usable_size` — actual usable bytes after size-class rounding;
57+
// reflects true memory consumption.
58+
// `threshold` — the threshold (bytes) that triggered this sample.
59+
// `bytes_since_last_sample` — bytes accumulated since the last sample; the
60+
// statistical weight of this sample.
61+
// `heap_tag` — tag of the heap that made the allocation, set via
62+
// mi_heap_new_ex(). Zero for the default heap.
63+
// Returns the number of bytes to accumulate before the next sample.
64+
// Returning 0 causes the next allocation to be sampled immediately.
65+
//
66+
// on_free: called when a sampled allocation is freed.
67+
// `user_data` — the same region written during on_alloc. Valid only
68+
// for the duration of the callback; do not retain the pointer.
69+
// `ptr` — the freed user pointer.
70+
// May be NULL if free-time notification is not needed.
71+
// ------------------------------------------------------------------
72+
typedef size_t (*mi_profiler_alloc_cb)(void* user_data, void* ptr, size_t requested_size, size_t usable_size, size_t threshold, size_t bytes_since_last_sample, uint8_t heap_tag);
73+
typedef void (*mi_profiler_free_cb)(void* user_data, void* ptr);
74+
75+
// ------------------------------------------------------------------
76+
// Global profiler configuration.
77+
//
78+
// Profiling is one-way: once enabled it cannot be disabled.
79+
//
80+
// `enabled` is _Atomic(bool) so that mi_profiler_enable() can be called
81+
// from any thread. The store uses release order; reads in the inline
82+
// fast-path hooks (in internal.h) use relaxed order (sufficient to decide
83+
// whether to do any work); the slow path uses acquire order to ensure
84+
// on_alloc, on_free, and record_extra_bytes are visible before they are read.
85+
// ------------------------------------------------------------------
86+
typedef struct mi_profiler_s {
87+
_Atomic(bool) enabled; // stored with release order when profiling is enabled; one-way, never reset
88+
mi_profiler_alloc_cb on_alloc; // non-NULL when enabled=true
89+
mi_profiler_free_cb on_free; // may be NULL
90+
size_t record_extra_bytes; // bytes allocated after each mi_alloc_record_t for user_data
91+
} mi_profiler_t;
92+
93+
extern mi_profiler_t _mi_profiler;
94+
95+
// ------------------------------------------------------------------
96+
// Public API — must be called at most once. May be called from any
97+
// thread, before or after other threads have started. Each thread
98+
// samples its first allocation immediately; the on_alloc callback
99+
// controls all subsequent thresholds.
100+
//
101+
// Returns true on success, false if:
102+
// - profiling was already enabled (called more than once), or
103+
// - on_alloc is NULL, or
104+
// - on_free is NULL but record_extra_bytes > 0, or
105+
// - record_extra_bytes would overflow the record allocation size.
106+
// ------------------------------------------------------------------
107+
bool mi_profiler_enable(size_t record_extra_bytes, mi_profiler_alloc_cb on_alloc, mi_profiler_free_cb on_free);
108+
109+
// ------------------------------------------------------------------
110+
// Slow-path implementations (defined in profile.c).
111+
// The inline fast-path wrappers are in internal.h so they have
112+
// access to the full type definitions they need.
113+
// ------------------------------------------------------------------
114+
void _mi_profiler_on_alloc_slow(mi_heap_t* heap, mi_page_t* page, void* ptr, size_t size);
115+
void _mi_profiler_on_free_local_slow(mi_page_t* page, void* ptr);
116+
void _mi_profiler_on_free_collected_slow(mi_page_t* page, mi_block_t* head);
117+
118+
#endif // MIMALLOC_PROFILE_H

include/mimalloc/types.h

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,9 @@ typedef uintptr_t mi_thread_free_t;
317317
// at least one block that will be added, or has already been added, to
318318
// the owning heap `thread_delayed_free` list. This guarantees that pages
319319
// will be freed correctly even if only other threads free blocks.
320+
// Forward declaration for the profiler record list stored in page->metadata.
321+
struct mi_alloc_record_s;
322+
320323
typedef struct mi_page_s {
321324
// "owned" by the segment
322325
uint32_t slice_count; // slices in this page (0 if not a page)
@@ -330,7 +333,9 @@ typedef struct mi_page_s {
330333
uint16_t reserved; // number of blocks reserved in memory
331334
mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits)
332335
uint8_t free_is_zero:1; // `true` if the blocks in the free list are zero initialized
333-
uint8_t retire_expire:7; // expiration count for retired blocks
336+
uint8_t has_metadata:1; // `true` if page->metadata is non-NULL; on the same cache line as
337+
// the hot free fields to avoid a cache miss on every deallocation
338+
uint8_t retire_expire:6; // expiration count for retired blocks (max value is MI_RETIRE_CYCLES=16, so 6 bits suffices)
334339

335340
mi_block_t* free; // list of available free blocks (`malloc` allocates from this list)
336341
mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`)
@@ -352,7 +357,12 @@ typedef struct mi_page_s {
352357
struct mi_page_s* prev; // previous page owned by this thread with the same `block_size`
353358

354359
// 64-bit 11 words, 32-bit 13 words, (+2 for secure)
355-
void* padding[1];
360+
// `metadata` is the head of a singly-linked list of profiler allocation
361+
// records (mi_alloc_record_t, defined in profile.h). It is zeroed when
362+
// the page is cleared (segment.c:mi_segment_page_clear) and must be
363+
// re-initialized when the page is reused. NULL when profiling is off or
364+
// no sampled allocations are live on this page.
365+
struct mi_alloc_record_s* metadata;
356366
} mi_page_t;
357367

358368

@@ -624,13 +634,21 @@ typedef struct mi_segments_tld_s {
624634
} mi_segments_tld_t;
625635

626636
// Thread local data
637+
// Per-thread profiler state (see include/mimalloc/profile.h)
638+
typedef struct mi_profiler_tld_s {
639+
size_t bytes_since_sample; // bytes allocated since the last sample
640+
size_t next_threshold; // sample when bytes_since_sample reaches this value (starts at 0, so the first allocation is sampled)
641+
bool in_profiler; // reentrancy guard: skip profiling inside profiler code
642+
} mi_profiler_tld_t;
643+
627644
struct mi_tld_s {
628645
unsigned long long heartbeat; // monotonic heartbeat count
629646
bool recurse; // true if deferred was called; used to prevent infinite recursion.
630647
mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted)
631648
mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates)
632649
mi_segments_tld_t segments; // segment tld
633650
mi_stats_t stats; // statistics
651+
mi_profiler_tld_t profiler; // heap profiler state
634652
};
635653

636654

src/alloc.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_
111111
#endif
112112
#endif
113113

114+
_mi_profiler_on_alloc(heap, page, block, size - MI_PADDING_SIZE);
114115
return block;
115116
}
116117

@@ -303,6 +304,10 @@ void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero,
303304
// mi_track_resize(p,size,newsize)
304305
// if (newsize < size) { mi_track_mem_noaccess((uint8_t*)p + newsize, size - newsize); }
305306
if (usable_post!=NULL) { *usable_post = mi_page_usable_block_size(page); }
307+
// TODO(profiler): if p has a live profiler record, notify the profiler of the
308+
// resize so it can update the recorded size. The allocate-and-copy path below
309+
// is handled correctly because it goes through mi_free + mi_heap_umalloc which
310+
// hit the existing on_free and on_alloc hooks.
306311
return p; // reallocation still fits and not more than 50% waste
307312
}
308313
void* newp = mi_heap_umalloc(heap,newsize,usable_post);

src/free.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,11 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool
3434
// checks
3535
if mi_unlikely(mi_check_is_double_free(page, block)) return;
3636
if (!was_guarded) { mi_check_padding(page, block); }
37+
// Profiler hook fires before any page state (local_free, used) is modified,
38+
// so page is fully consistent. mi_record_free may call mi_free internally,
39+
// which is safe because in_profiler suppresses recursion and the page is
40+
// in a valid pre-free state.
41+
_mi_profiler_on_free_local(page, block);
3742
if (track_stats) { mi_stat_free(page, block); }
3843
#if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN
3944
if (!mi_page_is_huge(page)) { // huge page content may be already decommitted

src/init.c

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ const mi_page_t _mi_page_empty = {
2020
0, // reserved capacity
2121
{ 0 }, // flags
2222
false, // is_zero
23+
false, // has_metadata
2324
0, // retire_expire
2425
NULL, // free
2526
NULL, // local_free
@@ -34,7 +35,7 @@ const mi_page_t _mi_page_empty = {
3435
MI_ATOMIC_VAR_INIT(0), // xthread_free
3536
MI_ATOMIC_VAR_INIT(0), // xheap
3637
NULL, NULL
37-
, { 0 } // padding
38+
, NULL // metadata
3839
};
3940

4041
#define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty)
@@ -140,7 +141,8 @@ mi_decl_cache_align static const mi_tld_t tld_empty = {
140141
false,
141142
NULL, NULL,
142143
{ MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, 0, &mi_subproc_default, tld_empty_stats }, // segments
143-
{ sizeof(mi_stats_t), MI_STAT_VERSION, MI_STATS_NULL } // stats
144+
{ sizeof(mi_stats_t), MI_STAT_VERSION, MI_STATS_NULL }, // stats
145+
{ 0, 0, false } // profiler
144146
};
145147

146148
mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
@@ -156,7 +158,8 @@ static mi_decl_cache_align mi_tld_t tld_main = {
156158
0, false,
157159
&_mi_heap_main, & _mi_heap_main,
158160
{ MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, 0, &mi_subproc_default, &tld_main.stats }, // segments
159-
{ sizeof(mi_stats_t), MI_STAT_VERSION, MI_STATS_NULL } // stats
161+
{ sizeof(mi_stats_t), MI_STAT_VERSION, MI_STATS_NULL }, // stats
162+
{ 0, 0, false } // profiler
160163
};
161164

162165
mi_decl_cache_align mi_heap_t _mi_heap_main = {

src/page.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,8 @@ static void _mi_page_thread_free_collect(mi_page_t* page)
211211
return; // the thread-free items cannot be freed
212212
}
213213

214+
_mi_profiler_on_free_collected(page, head);
215+
214216
// and append the current local free list
215217
mi_block_set_next(page,tail, page->local_free);
216218
page->local_free = head;

0 commit comments

Comments
 (0)