/*
* HEALER syscall-relation observer (Phase A: instrumentation only).
*
* Records (predset -> nr) edges discovered when a new kcov PC edge fires
* after a syscall sequence whose two-syscall predecessor window has been
* captured in the per-child sequence buffer. Phase A populates the
* relation table only -- the syscall picker does not consult it, no
* STRATEGY_HEALER bandit arm exists yet, and there is no CLI flag. The
* goal is to gather enough operator-visible data over a review window
* to validate that the relations the observer learns look plausible
* before authorising the picker work in Phase B.
*
* See include/healer.h for the data structure and per-field comments,
* and ~/gdrive/Obsidian/projects/trinity/trinity-todo.md (Multi-Strategy
* Rotation Phase 2 -> HEALER section) for the broader two-phase design.
*
* --- Lineage note ---
*
* The name HEALER comes from the SOSP'21 paper "HEALER: Relation
* Learning Guided Kernel Fuzzing" (Sun et al.), which seeds an
* influence relation matrix R[a][b] from MoonShine-style static
* field analysis of kernel handlers and refines it dynamically via
* observed coverage gain. This implementation is HEALER-INSPIRED
* rather than a faithful port; meaningful divergences worth noting
* for anyone reading the literature alongside the code:
*
* - The original HEALER stores PAIRS (a -> b) in a 2D matrix.
* This implementation tracks TRIPLES ((pred_a, pred_b) -> succ)
* in a sparse hash table. The pair table introduced by the
* static-seed work is parallel storage, not the primary unit.
*
* - The original HEALER's static prior comes from MoonShine's
* read/write field analysis on kernel sources. Trinity has no
* such analyser; the static seed here is a coarser approximation
* derived from trinity's own ARG_FD_* / ret_objtype metadata
* (producer syscall A's return type matches a typed-arg slot of
* consumer B). Probably ~80% of MoonShine's signal at zero
* analysis cost.
*
* - The original HEALER's R is consumed by syzkaller's program
* generator to bias A->B sequence picks. The trinity picker
* does NOT yet consult this table -- the bandit/explorer
* strategies pick syscalls from coverage feedback alone.
* "Phase B" above is where this is supposed to change.
*
* - Beta-Binomial smoothed score normalisation, per-predecessor
* frequency tracking, decay walks, the corrupt-entry filter, and
* the low-confidence and minimum-raw qualification floors are all
* trinity-specific additions that don't appear in the original
* paper.
*
* In practice the module is closer to a "syscall relation observer
* + dump" than to HEALER's program generator. Name retained because
* the intellectual lineage is real and the literature reference is
* useful for new contributors.
*/
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/utsname.h>
#include <time.h>
#include <unistd.h>
#include "arch.h" /* biarch */
#include "child.h"
#include "edgepair.h" /* EDGEPAIR_NO_PREV */
#include "exit.h" /* EXIT_NO_SYSCALLS_ENABLED */
#include "healer.h"
#include "locks.h"
#include "params.h" /* do_32_arch, do_64_arch */
#include "pids.h" /* getpid wrapper */
#include "random.h" /* ONE_IN */
#include "shm.h"
#include "stats.h"
#include "syscall.h" /* set_syscall_nr_random, EXPENSIVE */
#include "tables.h" /* print_syscall_name */
#include "trinity.h"
/*
* Number of (predset, promoted_nr) tuples surfaced by the periodic
* dump. Kept small enough that the per-tick output stays readable
* even at saturated load -- the underlying table can hold up to
* HEALER_RELATION_SLOTS * HEALER_PROMOTED_PER_SLOT (= 128K) entries
* if it ever fills, far more than an operator could scan inline.
*/
#define HEALER_DUMP_TOP_N 10
/*
* Low-confidence floor for top-N qualification. Entries whose
* predecessor pair has a zero appearance counter on at least one side
* lack this-run evidence and are dropped before the normalised score
* is even computed. This complements the HEALER_NORM_ALPHA /
* HEALER_NORM_BETA Bayesian shrinkage, which dampens the score both
* for entries that survive this filter (predfreq >= 1 on both sides)
* and for the pair-side dump path, which has no equivalent filter at
* all.
* Two real shapes hit this filter:
* - warm-started entries inherited from a prior run, whose pair
* hasn't yet been observed in the new run (both counters still 0);
* - predecessor-skipped leftovers, where one side of the pair is a
* syscall the observer-hook gate now refuses to feed into
* healer_seq (e.g. a syscall returning ENOSYS on this kernel),
* so its appearance counter stays 0 forever and the entry
* persists in the table until decay+eviction reclaims it.
* Filter at top-N qualification only; the observation, save/load and
* decay paths keep handling these entries normally.
*/
#define HEALER_DUMP_MIN_PRED_APPEARANCES 1
/*
* Minimum raw-observation floor for top-N qualification. A promoted
* entry's weight is the raw count of times that (predset -> successor)
* triple has been observed; entries with raw=1 are single-shot events
* and carry no statistical signal yet. Without this floor the
* normalisation in healer_normalised_score_milli amplifies a small
* observation count into a misleadingly high norm score whenever the
* combined predecessor frequency is also small -- the noise pairs that
* dominated the top-10 before the bias bumps were precisely small-raw
* entries riding a small denominator. Filter at top-N qualification
* only; observation, save/load and decay all keep handling these
* entries normally so they remain available to graduate into the
* ranking once they accumulate evidence.
*/
#define HEALER_DUMP_MIN_RAW 20
/*
* Initial weight installed by the static-seed loader for each producer/
* consumer pair derived from the existing ret_objtype / argtype metadata
* the syscall table already carries. Kept deliberately small so the
* static prior can rank pairs before any dynamic evidence exists, yet
* a real triple with raw >= 4 already outweighs a seed
* once observations actually accumulate. Bootstraps the picker's pair
* prior on cold runs without dominating the ranking long-term.
*/
#define HEALER_STATIC_SEED_WEIGHT 3
/*
* Bayesian smoothing constants applied to the relation-table dump's
* normalised score. The displayed score is
*
* norm = (raw + ALPHA) / (predfreq + ALPHA + BETA)
*
* which is the posterior mean of a Beta-Binomial model over the
* "observed | predecessor appeared" rate, with a Beta(ALPHA, BETA)
* prior centred at ALPHA/(ALPHA+BETA) ~= 0.2. The shrinkage suppresses
* spurious co-occurrence pairs whose raw and predfreq are both small
* (where the unsmoothed raw/predfreq metric inflated noise into the
* top-N) without distorting high-N entries: at predfreq in the
* hundreds the +ALPHA / +BETA terms are dwarfed by the live counts and
* the score tracks the raw ratio closely. Replaces the prior
* isqrt-dampened TF-IDF formulation, whose square-root denominator
* compressed the large-N range too aggressively while still rewarding
* single-digit-predfreq pairs at the noise floor.
*
* ALPHA == 5 matches the pseudo-count the prior PREDFREQ_BIAS used to
* keep the denominator off its minimum value, calibrated to the
* dynamic-pair entries that demonstrably carry signal in the first
* dump tick. BETA == 20 matches HEALER_DUMP_MIN_RAW: a fresh entry at
* the qualification floor, whose predecessors were seen only in those
* same raw observations, scores at most
* (raw + ALPHA) / (raw + ALPHA + BETA) ~= 0.56, capping the prior's
* influence on barely-
* qualified entries while letting genuinely repeated triples climb.
*/
#define HEALER_NORM_ALPHA 5
#define HEALER_NORM_BETA 20
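/*
 * Worked shrinkage example (illustrative counts, not from a live
 * run): a noise pair with raw == 2, predfreq == 2 scores
 * (2 + 5) / (2 + 5 + 20) ~= 0.26 instead of the unsmoothed
 * 2/2 == 1.0, while a hot triple with raw == 300, predfreq == 400
 * scores (300 + 5) / (400 + 5 + 20) ~= 0.72 against an unsmoothed
 * 0.75 -- the prior crushes the noise floor while barely perturbing
 * genuine signal.
 */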
/*
* Display-time pollution filter: minimum total HEALER observation
* count before the dump suppresses static-seeded entries whose
* participating syscalls have never succeeded for any child this
* run. Below this threshold the dump trusts the seed-derived
* ranking, since "no success yet" early in a run just means "the
* random scheduler hasn't picked it yet". Above it, successes == 0
* means the kernel is rejecting the syscall (ENOSYS, missing CONFIG,
* sandboxed) and the seed's pair will never produce real signal -- so
* surfacing it in the top-N is pure UX noise (e.g. {*, landlock_*}
* lines on a no-LANDLOCK kernel that pollute the seed-only section
* for the entire run). Replaces the failed startup ENOSYS-probe
* approach (commits 9ec0ac291dd2 / 3f61a24beebd, both reverted for
* hangs and fragility) with a no-startup-cost dump-path filter.
*
* Sized at 1000 because by the time HEALER has logged that many
* observations the random scheduler has had ample opportunity to try
* any genuinely-supported syscall at least once; an entry still at
* successes == 0 at that point is overwhelmingly likely to be one the
* kernel will keep rejecting.
*/
#define HEALER_POLLUTION_FILTER_THRESHOLD 1000UL
/*
* FNV-1a parameters from the canonical 32-bit FNV definition. Used
* over the byte representation of the sorted (pred_a, pred_b) tuple
* to derive the initial slot index. Cheap enough on the (rare)
* observer-hook path that adding a stronger hash is not worth the
* dependency cost.
*/
#define FNV1A_OFFSET_BASIS 0x811c9dc5U
#define FNV1A_PRIME 0x01000193U
/*
* Layout-pinning asserts so the (pred_a, pred_b, predset_hash) tuple
* accessed via the .key uint64_t union member matches what the
* struct-member view writes, and likewise for (nr, weight) inside
* struct healer_promoted. A future struct reorder that breaks the
* packing fails to compile rather than silently desynchronising the
* lockless CAS payload from the field view. Mirrors the static
* asserts edgepair.c uses to pin its own packed-key claim path.
*/
_Static_assert(offsetof(struct healer_relation, pred_a) == 0,
"pred_a must be at offset 0 for packed CAS");
_Static_assert(offsetof(struct healer_relation, pred_b) == 2,
"pred_b must be at offset 2 for packed CAS");
_Static_assert(offsetof(struct healer_relation, predset_hash) == 4,
"predset_hash must be at offset 4 for packed CAS");
_Static_assert(sizeof(uint16_t) == 2,
"uint16_t must be 2 bytes for packed CAS");
_Static_assert(sizeof(uint32_t) == 4,
"uint32_t must be 4 bytes for packed CAS");
_Static_assert(offsetof(struct healer_promoted, nr) == 0,
"nr must be at offset 0 for packed CAS");
_Static_assert(offsetof(struct healer_promoted, weight) == 4,
"weight must be at offset 4 for packed CAS");
_Static_assert(sizeof(unsigned int) == 4,
"unsigned int must be 4 bytes for packed CAS");
static void healer_maybe_decay(void);
static unsigned int healer_predset_hash(unsigned int pred_a, unsigned int pred_b)
{
uint32_t h = FNV1A_OFFSET_BASIS;
unsigned char buf[sizeof(unsigned int) * 2];
size_t i;
memcpy(buf, &pred_a, sizeof(pred_a));
memcpy(buf + sizeof(pred_a), &pred_b, sizeof(pred_b));
for (i = 0; i < sizeof(buf); i++) {
h ^= buf[i];
h *= FNV1A_PRIME;
}
/* predset_hash == 0 is the empty-slot sentinel; the (vanishingly
* rare) FNV-1a output of 0 collapses onto 1 so a real predset
* never collides with the empty marker. */
if (h == 0)
h = 1;
return h;
}
void healer_seq_push(struct childdata *child, unsigned int nr)
{
if (child == NULL)
return;
child->healer_seq[0] = child->healer_seq[1];
child->healer_seq[1] = nr;
if (child->healer_seq_count < 2)
child->healer_seq_count++;
}
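/*
 * Window behaviour (illustrative nrs): pushing 10, 20, 30 in turn
 * leaves healer_seq == {10, 20} after the second push and {20, 30}
 * after the third, with healer_seq_count saturating at 2 -- the
 * buffer always holds the two most recent syscalls, oldest first.
 */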
/*
* Pack (pred_a, pred_b, predset_hash) into a uint64_t matching the
* union layout in struct healer_relation. Goes through a typed
* temporary + memcpy so the access stays inside the well-defined
* union view that strict aliasing permits, the same trick edgepair.c
* uses for its packed (prev_nr, curr_nr) key.
*/
static uint64_t healer_pack_key(unsigned int pred_a, unsigned int pred_b,
unsigned int predset_hash)
{
struct {
uint16_t pa;
uint16_t pb;
uint32_t h;
} tmp = { (uint16_t)pred_a, (uint16_t)pred_b, predset_hash };
uint64_t packed;
memcpy(&packed, &tmp, sizeof(packed));
return packed;
}
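/*
 * On a little-endian target the packed value is equivalently
 * ((uint64_t)predset_hash << 32) | (pred_b << 16) | pred_a, e.g.
 * healer_pack_key(3, 7, 0xdeadbeef) == 0xdeadbeef00070003
 * (illustrative values). The code never relies on that arithmetic
 * identity, only on the memcpy'd layout matching the union view.
 */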
/*
* Unpack pred_a / pred_b out of a previously-loaded slot key. The
* dump path uses this so it can pull the full identifier triple from
* the single ACQUIRE-load of slot->key, rather than re-reading the
* struct fields and exposing itself to a non-atomic tear against a
* concurrent CAS-claim.
*/
static void healer_unpack_key(uint64_t key, unsigned int *pred_a,
unsigned int *pred_b)
{
struct {
uint16_t pa;
uint16_t pb;
uint32_t h;
} tmp;
memcpy(&tmp, &key, sizeof(tmp));
*pred_a = tmp.pa;
*pred_b = tmp.pb;
}
/*
* Pack (nr, weight) into the uint64_t entry view of struct
* healer_promoted. weight == 0 marks an empty entry; a real entry
* always carries weight >= 1, so the packed value is non-zero
* whenever the slot is populated.
*/
static uint64_t healer_pack_promoted(unsigned int nr, unsigned int weight)
{
struct {
unsigned int n;
unsigned int w;
} tmp = { nr, weight };
uint64_t packed;
memcpy(&packed, &tmp, sizeof(packed));
return packed;
}
static void healer_unpack_promoted(uint64_t entry, unsigned int *nr,
unsigned int *weight)
{
struct {
unsigned int n;
unsigned int w;
} tmp;
memcpy(&tmp, &entry, sizeof(tmp));
*nr = tmp.n;
*weight = tmp.w;
}
/*
* Bump the (predset, current_nr) tuple in `slot`, evicting the lowest-
* weight existing promoted entry when the slot is full. Returns true
* if an eviction took place, so the caller can bump the eviction
* counter without re-scanning the array. Lockless: each promoted
* entry is mutated via a 64-bit CAS on the (nr, weight) packed view,
* weight == 0 is the empty-entry sentinel, and CAS failure restarts
* the scan so we re-pick up any state another observer just published
* (a concurrent insert of the same nr collapses onto the bump path
* on the next pass; a concurrent eviction reopens an empty entry that
* the next pass tries to claim before scanning for a new victim).
*/
static bool healer_slot_record(struct healer_relation *slot,
unsigned int current_nr)
{
unsigned int i;
unsigned int victim_idx = 0;
unsigned int victim_weight = 0;
uint64_t victim_packed = 0;
bool victim_found;
uint64_t expected, target;
int restart_budget = HEALER_PROMOTED_PER_SLOT * 2;
restart:
if (--restart_budget < 0) {
/* Defensive bound on the lockless retry loop -- under
* pathological CAS contention we'd rather drop one
* observation than spin forever. In practice the
* observer-hook fire rate keeps per-slot contention
* vanishingly small; this exists for the worst-case
* tail and never trips in steady state. */
return false;
}
/* Phase 1: scan for an existing (predset, current_nr) entry and
* bump its weight. Atomic load gives us a coherent snapshot of
* the (nr, weight) pair; weight == 0 means the entry is empty
* (and not in flight, since claim/evict CASes publish weight >=
* 1 atomically). */
for (i = 0; i < HEALER_PROMOTED_PER_SLOT; i++) {
uint64_t entry;
unsigned int weight, nr;
entry = __atomic_load_n(&slot->promoted[i].entry,
__ATOMIC_RELAXED);
healer_unpack_promoted(entry, &nr, &weight);
if (weight != 0 && nr == current_nr) {
__atomic_fetch_add(&slot->promoted[i].weight,
1, __ATOMIC_RELAXED);
return false;
}
}
/* Phase 2: try to claim an empty entry for current_nr. CAS the
* packed (nr, weight) field from the all-zero empty marker to
* (current_nr, 1); a competing observer that wins the CAS for
* some nr' instead leaves us to restart, so we either land on
* the existing-match path (if nr' == current_nr) or claim a
* different empty entry on the next pass. */
for (i = 0; i < HEALER_PROMOTED_PER_SLOT; i++) {
uint64_t loaded;
loaded = __atomic_load_n(&slot->promoted[i].entry,
__ATOMIC_RELAXED);
if (loaded != 0)
continue;
expected = 0;
target = healer_pack_promoted(current_nr, 1);
if (__atomic_compare_exchange_n(&slot->promoted[i].entry,
&expected, target, false,
__ATOMIC_RELEASE,
__ATOMIC_RELAXED))
return false;
/* CAS lost: another observer just published into this
* entry. Restart from Phase 1 in case they published
* our nr (in which case we want the bump path). */
goto restart;
}
/* Phase 3: slot is full -- displace the lowest-weight entry.
* Mirrors the original eviction policy: the new entry inherits
* victim_weight + 1 so a freshly displaced predset is not
* instantly re-evicted on its next observation. CAS the
* victim's exact prior packed value so a concurrent bump or
* eviction is detected and triggers a re-scan. */
victim_found = false;
for (i = 0; i < HEALER_PROMOTED_PER_SLOT; i++) {
uint64_t entry;
unsigned int weight, nr;
entry = __atomic_load_n(&slot->promoted[i].entry,
__ATOMIC_RELAXED);
healer_unpack_promoted(entry, &nr, &weight);
/* Re-check for a concurrent insert of current_nr before
* we evict -- another observer might have published our
* nr while we were scanning Phases 1-2. */
if (weight != 0 && nr == current_nr) {
__atomic_fetch_add(&slot->promoted[i].weight,
1, __ATOMIC_RELAXED);
return false;
}
/* A concurrent eviction may have just opened a fresh
* empty entry; restart so we can try the cheaper Phase 2
* claim path before resorting to another eviction. */
if (weight == 0)
goto restart;
if (!victim_found || weight < victim_weight) {
victim_idx = i;
victim_weight = weight;
victim_packed = entry;
victim_found = true;
}
}
expected = victim_packed;
target = healer_pack_promoted(current_nr, victim_weight + 1);
if (__atomic_compare_exchange_n(&slot->promoted[victim_idx].entry,
&expected, target, false,
__ATOMIC_RELEASE,
__ATOMIC_RELAXED))
return true;
/* Victim was bumped or evicted out from under us -- restart. */
goto restart;
}
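/*
 * Race sketch (illustrative nrs): observers A and B both find
 * promoted[2] empty. A's CAS installs (openat, 1); B's CAS fails and
 * B restarts at Phase 1. If B was also recording openat it now lands
 * on the bump path and the entry reads (openat, 2); otherwise B
 * claims a different empty entry on the next Phase 2 pass. Either
 * way no observation is credited to the wrong nr.
 */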
void healer_observe_relation(struct childdata *child, unsigned int current_nr)
{
unsigned int pred_a, pred_b;
unsigned int predset_hash;
unsigned int slot_idx;
unsigned int probe;
struct healer_relation *table;
uint64_t target_key;
bool evicted = false;
if (child == NULL)
return;
/* Need both predecessor slots populated. The first two syscalls of
* a child's life have nothing to point at; skipping them costs us
* at most a handful of observations per child lifetime. */
if (child->healer_seq_count < 2)
return;
pred_a = child->healer_seq[0];
pred_b = child->healer_seq[1];
/* An EDGEPAIR_NO_PREV sentinel can ride into the buffer if the
* child resets its sequence (e.g. between op-types); treat that as
* "no usable predset" and skip rather than learning a relation
* anchored on a sentinel value. */
if (pred_a == EDGEPAIR_NO_PREV || pred_b == EDGEPAIR_NO_PREV)
return;
if (pred_a > pred_b) {
unsigned int tmp = pred_a;
pred_a = pred_b;
pred_b = tmp;
}
/*
* Bump the per-syscall predecessor-appearance counter that feeds the
* Bayesian-smoothed normalisation on the dump path. Done before the
* lookup so a probe-limit miss still increments the denominator --
* the syscall *did* appear as a predecessor in an observation, even
* if no slot was available to credit the pair. Skip the second
* bump when pred_a == pred_b so a self-paired predset (same syscall
* fired twice in a row) doesn't get double-counted against itself.
*/
if (pred_a < MAX_NR_SYSCALL)
__atomic_add_fetch(&shm->stats.healer_pred_appearance[pred_a],
1, __ATOMIC_RELAXED);
if (pred_b != pred_a && pred_b < MAX_NR_SYSCALL)
__atomic_add_fetch(&shm->stats.healer_pred_appearance[pred_b],
1, __ATOMIC_RELAXED);
predset_hash = healer_predset_hash(pred_a, pred_b);
slot_idx = predset_hash & (HEALER_RELATION_SLOTS - 1);
table = shm->healer_relations;
target_key = healer_pack_key(pred_a, pred_b, predset_hash);
for (probe = 0; probe < HEALER_PROBE_LIMIT; probe++) {
struct healer_relation *slot;
unsigned int idx;
uint64_t slot_key;
idx = (slot_idx + probe) & (HEALER_RELATION_SLOTS - 1);
slot = &table[idx];
slot_key = __atomic_load_n(&slot->key, __ATOMIC_ACQUIRE);
if (slot_key == 0) {
uint64_t expected = 0;
if (__atomic_compare_exchange_n(
&slot->key, &expected, target_key,
false, __ATOMIC_RELEASE,
__ATOMIC_RELAXED)) {
/* Slot is now ours. Fall through to
* healer_slot_record so the first promoted
* entry goes in via the same CAS-claim
* machinery any concurrent observer also
* uses -- a second observer that ACQUIREs
* slot->key right after our publish and
* races into promoted[0] is handled
* uniformly without needing a special
* "winner stores promoted[0] directly"
* path that would race against them. */
evicted = healer_slot_record(slot, current_nr);
break;
}
/* CAS lost: another observer claimed this slot.
* `expected` now holds the winning key -- if it
* matches our predset, fall through to the match
* path so we still bump (predset, current_nr). */
slot_key = expected;
if (slot_key != target_key)
continue;
}
if (slot_key == target_key) {
evicted = healer_slot_record(slot, current_nr);
break;
}
}
if (probe == HEALER_PROBE_LIMIT) {
/* Ran off the end of the probe window without finding
* either the matching predset or an empty slot. Drop the
* observation and surface the table-full event so the
* operator can spot a saturated table in the periodic dump. */
__atomic_fetch_add(&shm->stats.healer_table_full, 1,
__ATOMIC_RELAXED);
}
__atomic_fetch_add(&shm->stats.healer_relations_observed, 1,
__ATOMIC_RELAXED);
if (evicted)
__atomic_fetch_add(&shm->stats.healer_evictions, 1,
__ATOMIC_RELAXED);
/* Periodic weight-decay: every HEALER_DECAY_OBSERVATIONS one
* CAS-elected observer halves all entry weights (floor 1) so the
* relation table converges on persistently-correlated tuples
* rather than accumulating weight=1 co-occurrence noise that
* eviction alone only sheds at slot saturation. Cheap fast path
* when the gap isn't reached. */
healer_maybe_decay();
}
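/*
 * Expected call shape at the observer hook (hypothetical caller --
 * the real hook lives in the child syscall loop, outside this file):
 *
 *	if (new_kcov_edge_fired)
 *		healer_observe_relation(child, nr);
 *	healer_seq_push(child, nr);
 *
 * The relation is credited against the existing two-predecessor
 * window first; only then does nr itself enter the window as a
 * predecessor for whatever fires next.
 */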
/*
* Periodic weight-decay trigger.
*
* Without decay, every (predset, nr) tuple ever observed accumulates
* weight monotonically until the slot saturates and the lowest-weight
* entry gets evicted -- which means a single one-time co-occurrence
* (weight=1) sits in the table forever as long as the slot has room,
* dragging the top-N dump toward syscall-frequency noise rather than
* causal correlation. Periodic halving lets the table converge on
* persistently-correlated tuples: anything getting bumped at least
* every other window holds its rank, while a one-shot weight=1 entry
* stays at the noise floor and gets displaced when a real follow-up
* needs the slot.
*
* 5000 observations: tighter than the snapshot cadence (50000) on
* purpose -- fleet data shows the observation rate collapses an order
* of magnitude (~30/sec early-run -> ~0.5/sec post-saturation) once
* KCOV coverage flattens, so a 50K threshold meant decay never fired
* during the saturated steady state where it's most needed (top-N
* dominated by frozen historical bursts that don't reflect current
* causation). 5K fires several times during the hot phase too, but
* the decay walk is a single relaxed-atomic sweep so the cost is
* negligible vs the snapshot cadence on which the operator's only
* visible artifact (the dump line) rides. Decoupled from the
* snapshot interval since the two don't need to share a value.
*/
#define HEALER_DECAY_OBSERVATIONS 5000UL
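/*
 * Back-of-envelope steady state for the halving (illustrative, not
 * fleet data): an entry bumped k times per decay window follows
 * w' = w/2 + k, whose fixed point is w == 2k. A tuple observed ~50
 * times per window therefore hovers near weight 100, while a
 * one-shot tuple reaches the floor of 1 within a couple of windows
 * and becomes the natural eviction victim.
 */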
/*
* Wall-clock secondary trigger for the decay walk. The observation-based
* trigger above only fires when new edges are being discovered, but on a
* long-running fuzz the KCOV edge set saturates and the observation rate
* collapses to ~0 -- which leaves the table frozen with whatever
* historical weights it had at saturation, defeating the purpose of
* decay (the top-N becomes a snapshot of the early-run discovery burst
* rather than what's productive right now). 1800s is comfortably longer
* than a hot-phase observation window (decay fires several times during
* the burst via the observation trigger, well before 30 minutes elapse)
* but short enough that a saturated steady-state table doesn't sit
* unaged for hours. Hardcoded -- no operator knob, no expectation that
* fleet boxes will need to retune this.
*/
#define HEALER_DECAY_INTERVAL_SEC 1800UL
/*
* Single-runner election + decay walk. Mirrors healer_maybe_snapshot's
* window-CAS pattern so concurrent observers don't all walk the table
* at the same boundary; the decay walk itself is best-effort relaxed
* stores against the live table -- a concurrent observer's
* fetch-add-1 on an entry the decay walk just halved loses at most one
* weight bump, which the next observation re-credits. Intentionally
* cheaper than the snapshot path: no staging buffer, no fsync, no
* rename -- just one pass over HEALER_RELATION_SLOTS * HEALER_PROMOTED_PER_SLOT
* entries (= 128K relaxed atomic ops worst case, well under a millisecond).
*/
static void healer_maybe_decay(void)
{
unsigned long obs_now, old;
unsigned long now_sec, old_time, new_obs;
unsigned int i, j;
bool obs_trigger, time_trigger;
if (shm == NULL)
return;
obs_now = __atomic_load_n(&shm->stats.healer_relations_observed,
__ATOMIC_RELAXED);
old = __atomic_load_n(&shm->stats.healer_obs_at_last_decay,
__ATOMIC_RELAXED);
old_time = __atomic_load_n(&shm->stats.healer_time_at_last_decay,
__ATOMIC_RELAXED);
now_sec = (unsigned long)time(NULL);
obs_trigger = (obs_now >= old + HEALER_DECAY_OBSERVATIONS);
time_trigger = (now_sec >= old_time + HEALER_DECAY_INTERVAL_SEC);
if (!obs_trigger && !time_trigger)
return;
/* Window-CAS election on healer_obs_at_last_decay: whichever observer
* wins owns this decay boundary; losers see the advanced high-water-
* mark on their next call and early-return. RELAXED is enough -- the
* walk's atomic stores carry their own ordering against concurrent
* observer bumps, and the counter is just gating who runs, not what
* they observe.
*
* When only the time trigger fires, obs_now may equal `old` (no new
* observations since the last decay), and a CAS of (old -> old) would
* succeed for every concurrent observer rather than electing one.
* Force the new value to be strictly greater in that case so the CAS
* is a real change and contested calls actually serialise. The +1
* skew on the next observation-trigger boundary is irrelevant against
* the 5000-observation window. */
new_obs = (obs_now > old) ? obs_now : old + 1;
if (!__atomic_compare_exchange_n(&shm->stats.healer_obs_at_last_decay,
&old, new_obs,
false,
__ATOMIC_RELAXED, __ATOMIC_RELAXED))
return;
/* Won the election -- advance the time baseline so the next time-
* trigger window starts cleanly regardless of which trigger fired
* this time. No CAS needed: the obs-field election above already
* guarantees we're the sole writer for this decay boundary. */
__atomic_store_n(&shm->stats.healer_time_at_last_decay, now_sec,
__ATOMIC_RELAXED);
/* Walk the table once, halving every populated entry's weight with
* a floor of 1. Floor 1 (rather than evicting weight=0 entries) is
* deliberate: a borderline-real relation that hasn't yet accumulated
* a second bump still gets one more decay window to prove itself
* before the slot's eviction policy gets a crack at it. Race window
* with concurrent observers is fine: a fetch_add on an entry we
* halved races, but the worst case is one lost bump per racing
* observer per decay -- vanishing against the per-window observation
* count. */
for (i = 0; i < HEALER_RELATION_SLOTS; i++) {
struct healer_relation *slot = &shm->healer_relations[i];
uint64_t slot_key;
slot_key = __atomic_load_n(&slot->key, __ATOMIC_ACQUIRE);
if (slot_key == 0)
continue;
for (j = 0; j < HEALER_PROMOTED_PER_SLOT; j++) {
uint64_t entry;
unsigned int weight, nr, halved;
entry = __atomic_load_n(&slot->promoted[j].entry,
__ATOMIC_RELAXED);
healer_unpack_promoted(entry, &nr, &weight);
if (weight <= 1)
continue;
halved = weight / 2;
if (halved < 1)
halved = 1;
__atomic_store_n(&slot->promoted[j].weight, halved,
__ATOMIC_RELAXED);
}
}
__atomic_fetch_add(&shm->stats.healer_weight_decays_run, 1,
__ATOMIC_RELAXED);
}
/*
* Snapshot tuple emitted to the dump's top-N selector. Each promoted
* entry yields one of these; predset_hash is omitted because the
* (pred_a, pred_b) pair is already self-identifying for the dump's
* grouping needs. pred_a_freq / pred_b_freq are the per-predecessor
* appearance counters captured at scan time, kept on the entry so the
* sort-by-normalised-score and the per-line display both work from the
* same snapshot value (re-reading the live counter for the display
* after sorting on a stale read would risk emitting a normalised score
* that doesn't match the freq numbers shown next to it).
*
* Pair-table entries reuse this struct so the same sort and top-N
* selection ranks triples and pairs head-to-head by normalised score.
* For a pair entry (is_pair == true): pred_b carries the producer
* syscall, promoted_nr carries the consumer, pred_b_freq carries the
* producer's appearance count, and pred_a / pred_a_freq are unused
* (the rendered line shows `*` in the slot a triple would put pred_a).
*/
struct healer_dump_entry {
unsigned int pred_a;
unsigned int pred_b;
unsigned int promoted_nr;
unsigned int weight;
unsigned long pred_a_freq;
unsigned long pred_b_freq;
unsigned long norm_score_milli;
bool is_pair;
};
/*
* Integer square root via Newton's method, used by the dump-path
* normalisation. Trinity has no isqrt helper of its own and pulling in
* libm just for sqrt() on a once-per-dump-tick path is overkill; this
* converges in a handful of iterations for the input range we care
* about (per-syscall appearance products fit comfortably in 64 bits
* across any realistic fuzz duration).
*/
static unsigned long healer_isqrt(unsigned long n)
{
unsigned long x, y;
if (n < 2)
return n;
x = n;
y = (x + 1) / 2;
while (y < x) {
x = y;
y = (x + n / x) / 2;
}
return x;
}
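/*
 * Example trace (n == 10): x=10, y=5; then x=5, y=(5 + 10/5)/2 = 3;
 * then x=3, y=(3 + 10/3)/2 = 3; y is no longer < x, so the loop
 * exits and returns 3 == floor(sqrt(10)). Iterations grow
 * logarithmically in n, so even full 64-bit inputs converge in a few
 * dozen steps.
 */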
/*
* Bayesian-smoothed normalisation for the relation-table dump. Returns
* a fixed-point score scaled by 1000 so the dump path can sort and emit
* three-decimal precision without dragging floating point onto the hot
* stats output.
*
* Formula:
* combined_freq = isqrt(pred_a_freq * pred_b_freq)
* norm = (raw_weight + ALPHA) * 1000
* / (combined_freq + ALPHA + BETA)
*
* The two predecessor appearance counts are folded into a single
* combined frequency via geometric mean, matching the prior formula's
* intent that a triple with one rare predecessor still scores
* meaningfully higher than one with two frequent predecessors. The
* Beta(ALPHA, BETA) pseudo-counts then shrink the resulting ratio
* toward the prior mean, so a small-raw / small-predfreq pair no
* longer races to the top of the ranking on a tiny denominator. At
* combined_freq in the hundreds the +ALPHA / +BETA terms are dominated
* by the live counts and the score tracks the raw ratio closely, so
* genuinely high-signal entries are not perturbed.
*
* Raw weight is preserved on the per-entry display so the operator can
* still see the underlying signal alongside the normalised ranking.
*/
static unsigned long healer_normalised_score_milli(unsigned long raw_weight,
unsigned long pred_a_freq,
unsigned long pred_b_freq)
{
unsigned long combined_freq = healer_isqrt(pred_a_freq * pred_b_freq);
unsigned long denom = combined_freq + HEALER_NORM_ALPHA +
HEALER_NORM_BETA;
return ((raw_weight + HEALER_NORM_ALPHA) * 1000UL) / denom;
}
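/*
 * Worked call (illustrative inputs):
 * healer_normalised_score_milli(20, 100, 400) computes
 * combined_freq = isqrt(100 * 400) = 200 and returns
 * (20 + 5) * 1000 / (200 + 5 + 20) = 25000 / 225 = 111,
 * which the dump renders as norm=0.111.
 */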
static int healer_dump_entry_cmp(const void *a, const void *b)
{
const struct healer_dump_entry *ea = a;
const struct healer_dump_entry *eb = b;
if (ea->norm_score_milli > eb->norm_score_milli)
return -1;
if (ea->norm_score_milli < eb->norm_score_milli)
return 1;
return 0;
}
/*
* Bounded-size top-N insertion shared by the triple and pair
* collection loops. Holds the top HEALER_DUMP_TOP_N entries in
* unsorted form: fills until full, then evicts the slot with the
* smallest norm_score_milli when a higher-scoring candidate arrives.
* Hoisted out of the dump body so the same heap-replacement logic
* does not have to be repeated for the dynamic and seed-only pools
* the caller now keeps separate.
*/
static void healer_top_n_insert(struct healer_dump_entry *top,
unsigned int *top_count,
const struct healer_dump_entry *cand)
{
unsigned int min_idx = 0;
unsigned int k;
if (*top_count < HEALER_DUMP_TOP_N) {
top[*top_count] = *cand;
(*top_count)++;
return;
}
for (k = 1; k < HEALER_DUMP_TOP_N; k++) {
if (top[k].norm_score_milli < top[min_idx].norm_score_milli)
min_idx = k;
}
if (cand->norm_score_milli > top[min_idx].norm_score_milli)
top[min_idx] = *cand;
}
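/*
 * With HEALER_DUMP_TOP_N == 10 the replacement scan is at most ten
 * comparisons per candidate, so a full collection pass stays
 * O(entries * TOP_N); a real min-heap would buy nothing at this N.
 */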
/*
* Emit one ranked section of the top-N dump. Caller passes a header
* format string with a single %u for the count plus the already-sorted
* top[] slice; this just walks the slice and prints each entry in the
* same shape healer_table_dump used to print a single combined block.
* Pair entries render with `*` in the slot a triple would put pred_a so
* the operator can tell at a glance which lines come from the
* producer/consumer prior vs the runtime-observed triple table.
*/
static void healer_dump_emit_top(const char *header_fmt,
const struct healer_dump_entry *top,
unsigned int count)
{
unsigned int i;
stats_log_write(header_fmt, count);
for (i = 0; i < count; i++) {
if (top[i].is_pair) {
stats_log_write(" {*, %s} -> %s norm=%lu.%03lu (raw=%u, predfreq=%lu)\n",
print_syscall_name(top[i].pred_b, false),
print_syscall_name(top[i].promoted_nr, false),
top[i].norm_score_milli / 1000,
top[i].norm_score_milli % 1000,
top[i].weight,
top[i].pred_b_freq);
continue;
}
stats_log_write(" {%s, %s} -> %s norm=%lu.%03lu (raw=%u, predfreq a=%lu b=%lu)\n",
print_syscall_name(top[i].pred_a, false),
print_syscall_name(top[i].pred_b, false),
print_syscall_name(top[i].promoted_nr, false),
top[i].norm_score_milli / 1000,
top[i].norm_score_milli % 1000,
top[i].weight,
top[i].pred_a_freq,
top[i].pred_b_freq);
}
}
/*
* Per-syscall aggregate used by the predecessor-frequency leader
* display: appearance counter snapshot + sum of weights where this
* syscall is the *successor* of some pair. The latter lets the
* operator distinguish a pure noise rider (high appearance, zero
* successor weight -- e.g. getppid on the live runs) from a syscall
* that's frequent as a predecessor and also genuinely productive
* (high appearance, also non-trivial successor weight).
*/
struct healer_pred_leader {
unsigned int nr;
unsigned long appearances;
unsigned long succ_weight;
};
static int healer_pred_leader_cmp(const void *a, const void *b)
{
const struct healer_pred_leader *la = a;
const struct healer_pred_leader *lb = b;
if (la->appearances > lb->appearances)
return -1;
if (la->appearances < lb->appearances)
return 1;
return 0;
}
#define HEALER_PRED_LEADERS_TOP_N 5
/*
* True if the syscall's per-entry successes counter is still zero --
* i.e. either no child has ever called this syscall this run, OR every
* call has failed (typical for syscalls the running kernel does not
* support: landlock_* on a no-LANDLOCK kernel, etc.). Backs the
* dump-path pollution filter (HEALER_POLLUTION_FILTER_THRESHOLD).
*
* The earlier check on entry->attempted only caught the never-picked
* case and missed unsupported-syscall noise: landlock_create_ruleset
* gets picked normally on a no-LANDLOCK kernel, returns ENOSYS, bumps
* attempted but not successes, and slipped past the filter despite
* being exactly the noise the filter exists to suppress. Using
* successes covers both shapes -- attempted == 0 implies successes ==
* 0 -- and the outer HEALER_POLLUTION_FILTER_THRESHOLD already gates
* on the run being mature enough that "no successes yet" means
* "structurally won't succeed" rather than "warmup".
*
* NULL or out-of-range entries are treated as unproductive: a slot
* the build's syscall table does not even carry cannot meaningfully
* succeed, and the surrounding filter still requires the total
* observation threshold before acting on the result. do32 is left at
* false to match the call shape print_syscall_name uses everywhere
* else in the dump path -- HEALER does not separately track 32-bit
* dispatches.
*/
static bool healer_syscall_unattempted(unsigned int nr)
{
const struct syscallentry *entry;
if (nr >= MAX_NR_SYSCALL)
return true;
entry = get_syscall_entry(nr, false);
if (entry == NULL)
return true;
return entry->successes == 0;
}
void healer_table_dump(void)
{
/*
* Two parallel top-N pools so the dump can show high-confidence
* (any predfreq>0, dynamically observed at least once) and seed-
* only (predfreq==0, purely static-seeded) entries in separate
* ranked sections. Without the split, the K=5 denominator bias
* leaves un-confirmed seeds at norm=1.500 and they crowd out any
* dynamically observed entry whose normalised score has been
* dampened by the actual pred-appearance counts -- the seed pool
* dominates the combined ranking until dynamic entries accumulate
* enough observations to overtake them, which is the opposite of
* what the dump should be highlighting.
*/
struct healer_dump_entry top_dyn[HEALER_DUMP_TOP_N];
struct healer_dump_entry top_seed[HEALER_DUMP_TOP_N];
unsigned int top_dyn_count = 0;
unsigned int top_seed_count = 0;
unsigned int occupied = 0;
unsigned long total_promoted = 0;