/*
 * Deferred-free queue for temporal overlap of syscall allocations.
 *
 * Sanitise callbacks allocate structs/buffers that post callbacks would
 * normally free immediately after the syscall returns. This means the
 * kernel only ever sees one allocation at a time -- no temporal overlap.
 *
 * By queueing allocations for delayed free (5-50 more syscalls), we
 * keep multiple allocations alive simultaneously, increasing the chance
 * of hitting UAF, stale-reference, and double-free bugs in the kernel.
 *
 * The queue is a flat array scanned linearly. At 64 entries and
 * ~24 bytes per entry (LP64 layout), this is fast enough -- children
 * do millions of syscalls, so the tick overhead is negligible.
 */
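/*
 * Call-flow sketch (handler names illustrative, not actual trinity
 * call sites):
 *
 *   sanitise_foo(rec):  rec->a2 = (unsigned long) zmalloc(len);
 *   post_foo(rec):      deferred_freeptr(&rec->a2);  // enqueue, not free
 *   every syscall:      deferred_free_tick();        // TTL--, free at 0
 *   child exit:         deferred_free_flush();       // drain survivors
 */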
#include <errno.h>
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include "deferred-free.h"
#include "pc_format.h"
#include "random.h"
#include "shm.h"
#include "trinity.h"
#include "utils.h"
#define DEFERRED_RING_SIZE 64
#define DEFERRED_TTL_MIN 5
#define DEFERRED_TTL_MAX 50
/*
* Run the actual TTL-decrement-and-free loop on 1-in-N tick calls.
* The other (N-1) calls bail before taking the mprotect bracket.
* N must be a power of two so the modulo collapses to a bitmask.
*
* Side effect: TTL is effectively multiplied by N. Nominal range
* 5-50 syscalls becomes 80-800 syscalls of in-ring lifetime. This
* is fine -- and arguably better for catching UAF overlap -- but
* worth knowing when reading the TTL constants above.
*
* 8 was insufficient for the head->array container lifetime in
* add_object's OBJ_LOCAL grow path: a get_random_object() reader
* interrupted by a signal whose handler runs syscalls (ticking the
* ring) while the original code holds head->array in a register/
* cache can outlive a 40-400 syscall TTL when the signal handler
* is heavy. 16 keeps the same shape but lifts the headroom to a
* range no plausible reader window touches.
*/
#define DEFERRED_TICK_BATCH 16
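/*
 * Worked example of the power-of-two requirement: with a batch of 16,
 * the gate in deferred_free_tick() is a single mask --
 *
 *   (++tick_count & (DEFERRED_TICK_BATCH - 1)) != 0   // 15 of 16 calls
 *
 * -- and the nominal 5-50 TTL decrements become 5*16=80 .. 50*16=800
 * syscalls of in-ring lifetime, per the multiplier note above.
 */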
struct deferred_entry {
void *ptr;
void (*free_func)(void *);
unsigned int ttl;
};
/*
* Side-set of "live" malloc results. __zmalloc() registers every
* pointer it returns; deferred_free_enqueue() consumes the matching
* entry to confirm the pointer it has been handed is a real malloc
* result before letting it through to free().
*
* The pre-existing looks_like_corrupted_ptr() heuristic only rejects
* sub-page / above-canonical / mis-aligned values. A wholesale stomp
* that scribbles rec->post_state (or rec->aN) with an address that
* happens to land inside the heap arena passes every band of that test
* -- 8-byte aligned, in user VA, not pid-shaped -- yet is not a real
* malloc-return. Eight ASAN "bad-free" reports hit exactly that gap:
* the freed pointer was heap-region but not at an allocation start, so
* libc's free() rejects it. Tracking the set of live malloc results
* gives us ground truth that the pointer-shape heuristic can't.
*
* Sized for the in-flight window: between a sanitise's zmalloc and the
* matching post handler's deferred_free_enqueue, the same syscall does
* a handful of additional zmallocs (snap struct, arg generators, etc.)
* -- well under a hundred in the worst case. 256 entries gives ample
* headroom; on overflow we evict in arrival order, which only causes a
* benign drop (memory leak) of the evicted pointer's eventual free.
*
* Process-local: zero-initialised BSS, COW-shared at fork, written
* single-threaded by the owning child. No locking needed.
*/
#define ALLOC_TRACK_SIZE 256
static void *alloc_track[ALLOC_TRACK_SIZE];
static unsigned int alloc_track_head;
void deferred_alloc_track(void *ptr)
{
if (ptr == NULL)
return;
alloc_track[alloc_track_head % ALLOC_TRACK_SIZE] = ptr;
alloc_track_head++;
}
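/*
 * Producer-side sketch, assuming __zmalloc() registers its result
 * roughly like this (illustrative -- the real definition lives outside
 * this file):
 *
 *   void *__zmalloc(size_t size)
 *   {
 *       void *ptr = malloc(size);
 *       if (ptr != NULL) {
 *           memset(ptr, 0, size);        // the 'z' in zmalloc
 *           deferred_alloc_track(ptr);   // register as a live result
 *       }
 *       return ptr;
 *   }
 */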
/*
* Consume the entry matching @ptr. Returns true if found (and clears
* the slot); false if the pointer was not in the side-set, meaning the
* caller is about to free something __zmalloc() never produced.
*
* Scan backward from the newest entry. The vast majority of consumers
* are post handlers running a few syscalls after the matching __zmalloc()
* (PATHNAME / IOVEC / SOCKADDR generators enqueue 1-3 pointers per arg),
* so the hit lives near the head. Average match distance drops from
* ALLOC_TRACK_SIZE/2 to a handful of compares; miss cost is unchanged.
*/
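/*
 * Index-math sketch: with alloc_track_head == 3 the scan visits slots
 * 2, 1, 0, 255, 254, ... -- the unsigned (idx - 1) wraps cleanly under
 * the mask because ALLOC_TRACK_SIZE is a power of two.
 */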
static bool alloc_track_consume(void *ptr)
{
unsigned int idx = (alloc_track_head - 1) & (ALLOC_TRACK_SIZE - 1);
unsigned int i;
for (i = 0; i < ALLOC_TRACK_SIZE; i++) {
if (alloc_track[idx] == ptr) {
alloc_track[idx] = NULL;
return true;
}
idx = (idx - 1) & (ALLOC_TRACK_SIZE - 1);
}
return false;
}
/*
* Ring storage lives in an mmap'd region whose address range is registered
* with shared_regions[] via track_shared_region(). That tracking lets
* avoid_shared_buffer() and the mm-syscall sanitisers refuse fuzzed
* pointers/lengths that would land inside the ring -- previously the array
* lived in trinity's BSS, which is NOT registered with shared_regions[],
* so a fuzzed write could scribble ring[i].ptr with a pid-shaped value
* (residual-cores triage matched si_addr=0x378a02 against the killing
* process's pid) and the next deferred_free_tick() would free() the bogus
* pointer.
*
* MAP_PRIVATE (not MAP_SHARED via alloc_shared()) is deliberate: the queue
* is process-local by contract -- pointers come from each child's own
* post-fork heap. Sharing the ring across forks would let one child's
* deferred_free_tick() free a pointer enqueued by a different child --
* either a double free if both children reach ttl==0 on the same slot, or
* cross-heap chunk-metadata corruption because the freeing child's glibc
* has no record of an allocation at that address. Each forked child needs
* its own COW copy of the ring; only the address range is shared with
* the tracker.
*/
static struct deferred_entry *ring;
static unsigned int ring_count;
static size_t ring_bytes;
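/*
 * Sketch of what the registration buys (simplified -- the real helpers
 * live outside this file and may differ in detail):
 *
 *   // a fuzzed [addr, addr+len) that touches the ring gets refused
 *   if (range_overlaps_shared(addr, len))
 *       reroll_the_address();
 */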
/*
* One bit per ring slot: 1 == occupied, 0 == free. Lets enqueue find
* the next empty slot in O(1) via __builtin_ctzll(~occupied_mask)
* instead of a linear scan over all 64 entries. Maintained alongside
* ring_count: every ptr write that fills a slot sets the bit, every
* clear that empties a slot clears it. BSS-resident (not inside the
* mprotect-bracketed ring), so the cheap scan in enqueue's full-ring
* check and the ctzll lookup itself need no unlock. uint64_t suffices
* because DEFERRED_RING_SIZE == 64; a static_assert would be overkill
* for a single contiguous file.
*/
static uint64_t occupied_mask;
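/*
 * Worked example: with slots 0-2 occupied, occupied_mask == 0x7,
 * ~occupied_mask == ...fff8, and __builtin_ctzll() returns 3 -- the
 * lowest free slot -- without touching ring[] (so no unlock needed).
 */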
/*
 * Bracket every writer/reader of ring[] with mprotect(). Between
 * ticks the ring sits at PROT_NONE; a fuzzed value-result syscall
 * that tries to scribble inside it now faults in the kernel's
 * copy_to_user and returns EFAULT instead of silently overwriting
 * ring[i].ptr with a pid-shaped value (the cluster-1 root cause:
 * ~200 SIGSEGVs at deferred_free_tick+0x49 with si_addr ~= si_pid).
 * mprotect is async-signal-safe so these are safe to call from
 * anywhere deferred_free_* is reachable.
*
* ring_unlock() returns false on mprotect failure so callers bail out
* before touching ring[]. Failure is rare but does happen under
* fuzzing pressure (kernel VMA-limit ENOMEM when the per-process
* map_count cap is approached, transient EAGAIN under memory pressure,
* or a not-yet-sanitised mm-syscall slipping past the shared-region
* filter and modifying the ring's VMA). When the original bracket
* landed it logged-and-returned, leaving the page at PROT_NONE while
* the caller fell through into the ring access loop -- ~311 self-
* inflicted SEGV_ACCERR crashes per 1.5h fuzz run with si_addr
* matching the ring page, split across deferred_free_tick+0x7e
* (the ring[i].ttl read in the loop body) and deferred_free_enqueue
* +0x89 (the ring[i].ptr == NULL slot scan).
*/
static bool ring_unlock(void)
{
if (mprotect(ring, ring_bytes, PROT_READ | PROT_WRITE) != 0) {
outputerr("deferred_free: mprotect RW failed: %s\n",
strerror(errno));
return false;
}
return true;
}
static void ring_lock(void)
{
if (mprotect(ring, ring_bytes, PROT_NONE) != 0)
outputerr("deferred_free: mprotect NONE failed: %s\n",
strerror(errno));
}
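/*
 * Canonical bracket, shared by every ring accessor below:
 *
 *   if (!ring_unlock())
 *       return;              // page still PROT_NONE; see each caller
 *   ...read/write ring[]...
 *   ring_lock();
 */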
void deferred_free_init(void)
{
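	/* Sizing sketch (assuming x86_64 / LP64 and 4 KiB pages): 64
	 * entries * 24 bytes == 1536 raw bytes, rounded up to a single
	 * page so the mprotect() bracket has a page-granular target. */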
const size_t raw = sizeof(struct deferred_entry) * DEFERRED_RING_SIZE;
ring_bytes = ((raw + page_size - 1) / page_size) * page_size;
ring = mmap(NULL, ring_bytes, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANON, -1, 0);
if (ring == MAP_FAILED) {
outputerr("deferred_free_init: mmap %zu failed\n", ring_bytes);
exit(EXIT_FAILURE);
}
memset(ring, 0, ring_bytes);
track_shared_region((unsigned long)ring, ring_bytes);
ring_count = 0;
occupied_mask = 0;
ring_lock();
/*
* Cache the brk-arena extent now, before any child forks. Every
* child inherits the bounds via COW BSS so is_in_glibc_heap()
* needs no further /proc/self/maps reads at runtime. Read here
* rather than at use-site: a syscall fuzzer parsing /proc on
* every deferred_free_enqueue would dwarf the work it's gating.
*/
heap_bounds_init();
}
void deferred_free_enqueue(void *ptr, void (*free_func)(void *))
{
unsigned int i;
if (ptr == NULL)
return;
if (free_func == NULL)
free_func = free;
/*
* Alignment is non-negotiable. glibc malloc returns >= 8-byte
* aligned chunks on x86_64, so a free() candidate with low bits set
* cannot be a real allocation start. libasan internally CHECKs
* alignment in its poisoning path (asan_poisoning.cpp:
* "AddrIsAlignedByGranularity(addr) != 0") and aborts the child on
* the misaligned address before its bad-free reporter ever runs --
* the cluster shows up as a CHECK-failed crash without an ASAN
* report attached, which is harder to triage than a normal bad-free.
*
* Enforced unconditionally (independent of free_func) because an
* alt-allocator wrapper that ultimately routes to libasan-protected
* free() inherits the same alignment constraint -- the sentinel /
* non-heap-token tolerance carved out for custom free_func callers
* by the bands below cannot relax this one. The existing shape
* heuristic also rejects misaligned values, but only for the
* free_func == free path; this guard closes the custom-free_func
* gap and provides a single explicit chokepoint that survives
* future refactors of the conditional bands.
*/
if (((unsigned long)ptr & 0x7) != 0) {
static unsigned long misalign_drops;
unsigned long n = ++misalign_drops;
if ((n % 1000) == 1) {
char pcbuf[128];
outputerr("deferred_free_enqueue: rejected misaligned "
"ptr=%p caller=%s [%lu cumulative]\n", ptr,
pc_to_string(__builtin_return_address(0),
pcbuf, sizeof(pcbuf)), n);
}
return;
}
/*
* Reject pid-scribbled / canonical-out-of-range / misaligned values
* BEFORE they ever reach the ring. Cluster-1/2/3 root cause
* (residual-cores triage 2026-05-02): a sibling fuzzed value-result
* syscall scribbles a tid/pid into rec->aN, the post handler does
* deferred_freeptr(&rec->aN) which arrives here, and N syscalls
* later deferred_free_tick() free()s the pid -- SIGSEGV with
* si_addr==si_pid. Drop the bad value at the post-handler boundary
* (one counter bumped, ring slot stays empty) instead of letting
* the corruption propagate into the ring. Gated on free_func==free
* because custom free funcs may legitimately receive non-heap
* tokens (caller knows what they're doing); same gating convention
* as the range_overlaps_shared check below.
*/
if (free_func == free &&
looks_like_corrupted_ptr_pc(NULL, ptr, __builtin_return_address(0))) {
outputerr("deferred_free_enqueue: rejected suspicious ptr=%p "
"(pid-scribbled?)\n", ptr);
return;
}
/*
* Heap-bounds backstop: every pointer __zmalloc() can hand back
* lives inside the brk arena cached at init time. A scribbled
* snapshot/arg slot whose value passes the shape heuristic above
* but lands in the stack, an mmap'd library, an executable
* mapping, or one of trinity's own MAP_PRIVATE regions cannot be
* a real malloc result -- free()ing it is undefined. Two
* compares, branch-predictable, no syscalls; cheaper than the
* O(N) alloc-track scan below and catches the case where the
* stomp value coincidentally matches a recently-evicted ring
* slot the alloc-track ring no longer remembers. Custom
* free_func callers stay exempt -- mirrors the gating convention
* used by the shape and shared-region bands.
*/
if (free_func == free && !is_in_glibc_heap(ptr)) {
static unsigned long non_heap_drops;
unsigned long n = ++non_heap_drops;
if ((n % 1000) == 1) {
char pcbuf[128];
outputerr("deferred_free_enqueue: rejected ptr=%p "
"(outside glibc heap) caller=%s "
"[%lu cumulative]\n", ptr,
pc_to_string(__builtin_return_address(0),
pcbuf, sizeof(pcbuf)), n);
}
__atomic_add_fetch(&shm->stats.snapshot_non_heap_reject, 1, __ATOMIC_RELAXED);
return;
}
/*
* Ground-truth check: refuse to enqueue a pointer that __zmalloc()
* never produced. Catches the bad-free class where a sibling stomp
* (or kernel write into a mistakenly-aliased rec field) overwrites
* a snapshot/arg slot with a heap-region-shaped value that defeats
* the heuristic guard above. Eight ASAN bad-frees in a recent run
* all matched this shape: 8-byte aligned, in user VA, sitting inside
* the heap arena, but not at any malloc-returned offset. The custom
* free_func path is exempt -- callers using their own free routine
* may legitimately pass non-heap tokens (sentinel values, mmap
* pointers managed by the alt allocator) the same gating convention
* the looks_like_corrupted_ptr check above uses.
*/
if (free_func == free && !alloc_track_consume(ptr)) {
static unsigned long unknown_drops;
unsigned long n = ++unknown_drops;
if ((n % 1000) == 1) {
char pcbuf[128];
outputerr("deferred_free_enqueue: rejected ptr=%p "
"(not a tracked allocation) caller=%s "
"[%lu cumulative]\n", ptr,
pc_to_string(__builtin_return_address(0),
pcbuf, sizeof(pcbuf)), n);
}
post_handler_corrupt_ptr_bump(NULL, __builtin_return_address(0));
return;
}
/*
* Refuse to enqueue a pointer that lands inside one of trinity's
* own mmap'd shared regions. ASAN catches these as bad-free
* (libasan: "attempting free on address which was not malloc()-ed"),
* non-ASAN runs silently corrupt the glibc allocator. Either way
* the underlying bug is some arg generator handing back a tracked-
* mmap pointer for an arg slot whose argtype (PATHNAME, IOVEC,
* SOCKADDR) generic_free_arg expects to be heap-allocated.
*
* Logging the caller PC so we can still find the offending
* generator -- the guard fixes the symptom but the rejection log
* is the breadcrumb to the root cause. Limited to one print per
* 1000 rejects to keep noise sane.
*
* This range check runs BEFORE ring_unlock() so we don't pay the
* mprotect cost on rejected enqueues.
*/
	if (free_func == free && range_overlaps_shared((unsigned long)ptr, 1)) {
static unsigned long rejects;
unsigned long n = ++rejects;
if ((n % 1000) == 1) {
char pcbuf[128];
outputerr("deferred_free_enqueue: rejected ptr=%p "
"(overlaps shared region) caller=%s "
"[%lu cumulative]\n", ptr,
pc_to_string(__builtin_return_address(0),
pcbuf, sizeof(pcbuf)), n);
}
return;
}
/* If ring_unlock() fails the page stays PROT_NONE; falling
* through into the slot scan would SEGV_ACCERR. Free the ptr
* directly so the caller's contract (ptr is no longer their
* problem) still holds. */
if (!ring_unlock()) {
free_func(ptr);
return;
}
	/* If the ring is full, force-free the oldest (lowest TTL) entry
	 * to make room. In practice this rarely happens -- nominal TTL
	 * is 5-50 decrements, i.e. 80-800 syscalls of lifetime with the
	 * tick batch, against 64 slots. */
if (ring_count == DEFERRED_RING_SIZE) {
unsigned int oldest = 0;
unsigned int min_ttl = UINT_MAX;
for (i = 0; i < DEFERRED_RING_SIZE; i++) {
if (ring[i].ptr != NULL && ring[i].ttl < min_ttl) {
min_ttl = ring[i].ttl;
oldest = i;
}
}
if (ring[oldest].ptr != NULL && ring[oldest].free_func != NULL) {
ring[oldest].free_func(ring[oldest].ptr);
ring[oldest].ptr = NULL;
occupied_mask &= ~(1ULL << oldest);
ring_count--;
}
}
/* Find an empty slot. After the full-ring eviction above, at
* least one bit in occupied_mask is clear, so ~occupied_mask is
* non-zero and __builtin_ctzll's UB-on-zero case can't fire. */
i = __builtin_ctzll(~occupied_mask);
ring[i].ptr = ptr;
ring[i].free_func = free_func;
ring[i].ttl = RAND_RANGE(DEFERRED_TTL_MIN, DEFERRED_TTL_MAX);
occupied_mask |= 1ULL << i;
ring_count++;
ring_lock();
}
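/*
 * Take ownership of a pointer stored in a syscall-record slot. The
 * slot is zeroed before enqueueing so the record no longer references
 * memory whose free is now the ring's job.
 */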
void deferred_freeptr(unsigned long *p)
{
void *ptr = (void *) *p;
*p = 0;
deferred_free_enqueue(ptr, NULL);
}
/*
* Free one ring entry's payload, dropping it if the pointer fails the
* sanity bands. Both the tick (TTL expiry) and flush (child exit)
 * paths route through here -- before this helper existed, only the
 * tick path had these checks, so a corrupted ring entry surviving
 * until child exit would be silently freed through
 * deferred_free_flush(). The tick guard
* rejected ~47.7k corrupt-pointer scribbles in a single 6.76h run
* (~2/sec), so the ring DOES get scribbled in practice; every entry
* the tick guard would have rejected was being silently freed by
* flush instead.
*
 * Caller must clear ring[slot].ptr (and decrement ring_count on paths
 * that track it per slot) before calling. Clearing first means a
 * signal that longjmps out of fn() can't leave a freed pointer pending
 * in the ring.
*
* Sub-page guard: a ptr below 0x10000 cannot be a real heap address;
* almost certainly a fuzzed value-result syscall scribbled a pid-shape
* into the slot. Drop rather than crash on free().
*
* Alignment guard: defense in depth at the free() boundary. glibc
* malloc returns >= 8-byte aligned chunks; libasan internally CHECKs
* alignment and aborts the child without an ASAN bad-free report
* (asan_poisoning.cpp: "AddrIsAlignedByGranularity(addr) != 0"),
* which is harder to triage than a normal bad-free.
*/
static void free_ring_entry(void *ptr, void (*fn)(void *), unsigned int slot)
{
if ((unsigned long)ptr < 0x10000) {
outputerr("deferred_free: rejected suspicious ptr=%p "
"in slot %u (looks pid-shaped)\n", ptr, slot);
__atomic_add_fetch(&shm->stats.deferred_free_corrupt_ptr, 1, __ATOMIC_RELAXED);
return;
}
if (((unsigned long)ptr & 0x7) != 0) {
outputerr("deferred_free: rejected misaligned ptr=%p "
"in slot %u\n", ptr, slot);
__atomic_add_fetch(&shm->stats.deferred_free_corrupt_ptr, 1, __ATOMIC_RELAXED);
return;
}
fn(ptr);
}
void deferred_free_tick(void)
{
static unsigned int tick_count;
unsigned int i;
/* Cheap path: ring_count is read while still locked, but it lives
* in BSS (not in the protected ring), so this access is safe. */
if (ring_count == 0)
return;
/*
* Batch ticks: run the full mprotect+walk+free bracket only on
* 1-in-DEFERRED_TICK_BATCH calls. The other calls bail here
 * without taking the mprotect bracket -- a ~16x reduction in
* mprotect syscalls (and matching TLB-shootdown traffic across
* sibling fuzz children). See DEFERRED_TICK_BATCH comment for
* the TTL-multiplier side effect.
*
* tick_count is BSS-resident (not in the ring), and per-child by
* fork's COW, so this static is safe to touch without unlocking.
*/
if ((++tick_count & (DEFERRED_TICK_BATCH - 1)) != 0)
return;
/* On unlock failure the page is still PROT_NONE; bail rather
* than SEGV_ACCERR in the loop below. Entries stay queued and
* will be retried on the next tick. */
if (!ring_unlock())
return;
for (i = 0; i < DEFERRED_RING_SIZE; i++) {
void *ptr;
void (*fn)(void *);
if (ring[i].ptr == NULL)
continue;
if (ring[i].ttl > 0) {
ring[i].ttl--;
continue;
}
		/* TTL expired -- free it. Clear the slot BEFORE calling
		 * the free function so that if a signal interrupts us
		 * mid-free and we longjmp, the slot is already empty. */
ptr = ring[i].ptr;
fn = ring[i].free_func;
ring[i].ptr = NULL;
occupied_mask &= ~(1ULL << i);
ring_count--;
free_ring_entry(ptr, fn, i);
}
ring_lock();
}
void deferred_free_flush(void)
{
unsigned int i;
/* Called from the child exit path; if unlock fails the deferred
* ptrs leak, but the child is going away so the kernel reaps
* them at exit. Better than SEGV_ACCERR-ing on the way out. */
if (!ring_unlock())
return;
for (i = 0; i < DEFERRED_RING_SIZE; i++) {
void *ptr;
void (*fn)(void *);
if (ring[i].ptr == NULL)
continue;
/* Clear before invoking, mirroring tick: a signal that
* longjmps mid-free leaves the slot empty either way. */
ptr = ring[i].ptr;
fn = ring[i].free_func;
ring[i].ptr = NULL;
free_ring_entry(ptr, fn, i);
}
ring_count = 0;
occupied_mask = 0;
ring_lock();
}