-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathfactorizer.h
More file actions
605 lines (538 loc) · 25.8 KB
/
factorizer.h
File metadata and controls
605 lines (538 loc) · 25.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
/**
* stride_factorizer.h — CPU-aware factorizer for stride-based FFT executor
*
* Given N, decomposes into available radixes with cache-aware ordering.
*
* Design:
* 1. Find all possible radix decompositions of N
* 2. For each decomposition, score based on cache behavior
* 3. Pick the best, or try all orderings (auto-tune mode)
*
* Cache model:
* Stage s processes groups of R_s butterflies at stride S_s.
* Each butterfly touches R_s elements separated by S_s doubles.
* Working set per group = R_s * S_s * 16 bytes (split complex).
* Twiddle table per stage = (R_s - 1) * K * 16 bytes.
*
* Optimal: working set fits L1 for all stages.
* Reality: outer stages exceed L1 → minimize damage.
*/
#ifndef STRIDE_FACTORIZER_H
#define STRIDE_FACTORIZER_H
#include "registry.h"
#include "generated/radix_profile.h" /* per-radix arithmetic + register-pressure
* profile (auto-generated by
* tools/radix_profile/extract.py) */
#include "generated/radix_cpe.h" /* per-radix per-variant cycles-per-
* butterfly (auto-generated by
* tools/radix_profile/measure_cpe.c) */
#include "wisdom_bridge.h" /* stride_prefer_dit_log3, stride_prefer_t1s
* — used by the cost model to mirror
* the variant selection that
* _stride_build_plan does at plan time. */
#include <math.h>
#include <string.h>
#if defined(__linux__)
#include <unistd.h> /* sysconf(): L1/L2 cache queries in stride_detect_cpu */
#endif
#define FACT_MAX_STAGES 9
#define FACT_MAX_DECOMPOSITIONS 64
/* ═══════════════════════════════════════════════════════════════
* CPU CACHE DETECTION
* ═══════════════════════════════════════════════════════════════ */
/* CPU cache geometry, filled in by stride_detect_cpu(). */
typedef struct {
    size_t l1d_bytes;  /* L1 data cache size */
    size_t l2_bytes;   /* L2 cache size */
    size_t cache_line; /* cache line size (bytes) */
} stride_cpu_info_t;
#ifdef _WIN32
#include <intrin.h>
#endif
/* Detect L1D/L2 sizes and the L1D line length for the current CPU.
 *
 * Windows + MSVC/ICC: CPUID leaf 4 (deterministic cache parameters).
 * Linux/glibc:        sysconf() cache queries (guarded — the _SC_LEVEL*
 *                     constants are a glibc extension; musl lacks them).
 * Anything else:      conservative defaults (48 KiB L1D, 2 MiB L2, 64 B line).
 *
 * Fixes vs. the previous version:
 *  - cache_line was overwritten on EVERY CPUID subleaf, so the line size
 *    of the last-enumerated level (typically L3) won; now only the L1
 *    data/unified cache sets it.
 *  - the Linux path now also queries the L1D line size instead of
 *    silently keeping the default. */
static stride_cpu_info_t stride_detect_cpu(void) {
    stride_cpu_info_t info = {48 * 1024, 2 * 1024 * 1024, 64}; /* defaults */
#if defined(_WIN32) && (defined(_MSC_VER) || defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER))
    /* CPUID leaf 4: deterministic cache parameters, one subleaf per cache. */
    for (int sub = 0; sub < 8; sub++) {
        int cpuinfo[4] = {0};
        __cpuidex(cpuinfo, 4, sub);
        int type = cpuinfo[0] & 0x1F;
        if (type == 0) break; /* no more caches */
        int level = (cpuinfo[0] >> 5) & 0x7;
        size_t ways  = (size_t)(((unsigned)cpuinfo[1] >> 22) & 0x3FF) + 1;
        size_t parts = (size_t)(((unsigned)cpuinfo[1] >> 12) & 0x3FF) + 1;
        size_t line  = (size_t)((unsigned)cpuinfo[1] & 0xFFF) + 1;
        size_t sets  = (size_t)(unsigned)cpuinfo[2] + 1;
        size_t sz = ways * parts * line * sets;
        if (level == 1 && (type == 1 || type == 3)) { /* data or unified */
            info.l1d_bytes = sz;
            info.cache_line = line; /* take the line size from L1D only */
        }
        if (level == 2) info.l2_bytes = sz;
    }
#elif defined(__linux__) && defined(_SC_LEVEL1_DCACHE_SIZE)
    {
        long l1 = sysconf(_SC_LEVEL1_DCACHE_SIZE);
        long l2 = sysconf(_SC_LEVEL2_CACHE_SIZE);
        if (l1 > 0) info.l1d_bytes = (size_t)l1;
        if (l2 > 0) info.l2_bytes = (size_t)l2;
#ifdef _SC_LEVEL1_DCACHE_LINESIZE
        long ln = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
        if (ln > 0) info.cache_line = (size_t)ln;
#endif
    }
#endif
    return info;
}
/* ═══════════════════════════════════════════════════════════════
* FACTORIZATION RESULT
* ═══════════════════════════════════════════════════════════════ */
/* Result of factorizing N: an ordered list of radix stages. Stage order
 * is significant — it is the execution order of the FFT passes. */
typedef struct {
int factors[FACT_MAX_STAGES]; /* radix of each stage, in execution order */
int nfactors; /* number of valid entries in factors[] */
} stride_factorization_t;
/* ═══════════════════════════════════════════════════════════════
* GREEDY FACTORIZER — composite-preferring
*
* Step 1: Decompose N into available radixes, preferring composites
* (R=25,20,12,10) over equivalent small-factor decompositions.
* This minimizes stage count.
*
* Step 2: Order factors based on K.
* Small K (<=16): descending (largest first) — everything fits L1.
* Large K (>16): ascending (smallest first) — reduce stride pressure
* on outer stages, put big radix last (stride=K, sequential).
*
* Step 3: Push R=64 away from first stage (its 2225-op codelet has too many
* compiler spills to be efficient as the first pass).
* ═══════════════════════════════════════════════════════════════ */
/* Radixes sorted by preference for decomposition: composites first to absorb factors */
static const int FACTORIZE_RADIXES[] = {
25, 20, 12, 10, 32, 16, 8, 7, 6, 5, 4, 3, 2,
64, /* R=64 last in decomposition preference — only if needed */
19, 17, 13, 11, /* odd primes last */
0 /* sentinel — terminates the scans in the greedy factorizer */
};
/* In-place ascending sort of the factor list. Selection sort — n is
 * tiny (at most FACT_MAX_STAGES), so simplicity beats asymptotics. */
static void _sort_factors_ascending(int *f, int n) {
    for (int i = 0; i + 1 < n; i++) {
        int lo = i;
        for (int j = i + 1; j < n; j++)
            if (f[j] < f[lo]) lo = j;
        if (lo != i) {
            int t = f[i];
            f[i] = f[lo];
            f[lo] = t;
        }
    }
}
/* In-place descending sort of the factor list. Selection sort — n is
 * tiny (at most FACT_MAX_STAGES), so simplicity beats asymptotics. */
static void _sort_factors_descending(int *f, int n) {
    for (int i = 0; i + 1 < n; i++) {
        int hi = i;
        for (int j = i + 1; j < n; j++)
            if (f[j] > f[hi]) hi = j;
        if (hi != i) {
            int t = f[i];
            f[i] = f[hi];
            f[hi] = t;
        }
    }
}
/* Check if a radix's twiddle table overflows half the L1 at this K.
*
* Twiddle table size for one twiddled stage: (R-1) * K * 16 bytes.
* Budget = L1 / 2: twiddles get half, data + codelet spills get half.
*
* When nothing fits within L1/2, the fallback in the greedy loop picks
* the largest available radix anyway (fewer stages > perfect caching).
* This two-tier approach avoids both extremes:
* - L1/3 was too strict: rejected R=8 at K=256, caused stage explosion
* - Full L1 was too loose: left no cache for data on smaller L1 CPUs */
static int _tw_overflows_l1(int R, size_t K, size_t l1_bytes) {
    /* One twiddled stage needs (R-1) * K complex doubles = 16 bytes each;
     * its budget is half the L1 (the other half is for data + spills). */
    const size_t table_bytes = (size_t)(R - 1) * K * 16;
    return table_bytes > l1_bytes / 2;
}
/* Greedy decomposition of N into available radixes, K-aware ordering.
 * Returns 0 on success (fact filled), -1 when N cannot be fully
 * decomposed within FACT_MAX_STAGES over the registered radix set. */
static int stride_factorize_greedy(int N, size_t K,
                                   const stride_registry_t *reg,
                                   const stride_cpu_info_t *cpu,
                                   stride_factorization_t *fact) {
    memset(fact, 0, sizeof(*fact));
    if (N <= 1) return 0;

    /* Peel radixes off N in FACTORIZE_RADIXES preference order.
     * Tier 0 accepts only radixes whose twiddle table fits half the L1
     * (the first stage is untwiddled, so it bypasses that check).
     * Tier 1 drops the constraint entirely — fewer stages beat perfect
     * caching, since each stage is a full pass over all N*K data and
     * the twiddles stream sequentially through L2 anyway. */
    int rem = N;
    int count = 0;
    while (rem > 1 && count < FACT_MAX_STAGES) {
        int chosen = 0;
        for (int tier = 0; tier < 2 && !chosen; tier++) {
            for (const int *rp = FACTORIZE_RADIXES; *rp; rp++) {
                const int R = *rp;
                if (rem % R) continue;
                if (!stride_registry_has(reg, R)) continue;
                if (tier == 0 && count > 0 &&
                    _tw_overflows_l1(R, K, cpu->l1d_bytes)) continue;
                chosen = R; /* list is largest-preferred — take the first hit */
                break;
            }
        }
        if (!chosen) return -1;
        fact->factors[count++] = chosen;
        rem /= chosen;
    }
    if (rem != 1) return -1;
    fact->nfactors = count;

    /* K-aware ordering: at small K everything fits L1, so run the big
     * radixes first; at large K put small radixes on the high-stride
     * outer stages and leave the big ones for the sequential tail. */
    if (K <= 16)
        _sort_factors_descending(fact->factors, count);
    else
        _sort_factors_ascending(fact->factors, count);

    /* Keep R>=64 out of stage 0 (its huge codelet spills too much there);
     * rotate it to the innermost, stride=K sequential position instead. */
    if (count > 1 && fact->factors[0] >= 64) {
        int big = fact->factors[0];
        for (int i = 0; i + 1 < count; i++)
            fact->factors[i] = fact->factors[i + 1];
        fact->factors[count - 1] = big;
    }
    return 0;
}
/* ═══════════════════════════════════════════════════════════════
* SCORE A FACTORIZATION ORDER
*
* Lower score = better. Estimates total cache misses.
*
* For each stage s (processed in order 0, 1, ..., nf-1):
* stride_s = K * product of factors[s+1 .. nf-1]
* working_set_s = R_s * stride_s * 16 bytes
* groups_s = N / R_s
*
* If working_set fits L1: cost = groups * R * K (pure arithmetic)
* If working_set fits L2: cost = groups * R * K * 2 (L2 latency penalty)
* If exceeds L2: cost = groups * R * K * 8 (L3/DRAM penalty)
*
* Twiddle overhead:
* For stage 0: no twiddle
* For stages 1+: tw_size = (R-1) * K_accumulated * 16
* If tw_size > L1: add penalty (twiddle misses)
* ═══════════════════════════════════════════════════════════════ */
/* Per-butterfly cycle cost lookup. Reads the auto-generated radix_cpe.h
* (produced by tools/radix_profile/measure_cpe.c on the calibration host).
*
* Returns 0.0 when the slot isn't populated (codelet not registered for
* this ISA, or measure_cpe didn't see it) — caller falls back.
*
* variant_idx: 0=n1, 1=t1, 2=t1s, 3=log3. */
#define _CPE_VARIANT_N1 0
#define _CPE_VARIANT_T1 1
#define _CPE_VARIANT_T1S 2
#define _CPE_VARIANT_LOG3 3
/* Measured cycles-per-butterfly for (R, variant) on the selected ISA.
 * Returns 0.0 for out-of-range R, an unknown variant index, or an
 * unpopulated slot — caller falls back to the ops-based estimate. */
static inline double _radix_cpe_lookup(int R, int variant_idx, int isa_avx512) {
    if (R <= 0 || R >= STRIDE_RADIX_PROFILE_MAX_R) return 0.0;
    const stride_radix_cpe_t *row =
        isa_avx512 ? &stride_radix_cpe_avx512[R] : &stride_radix_cpe_avx2[R];
    if (variant_idx == _CPE_VARIANT_N1)   return row->cyc_n1;
    if (variant_idx == _CPE_VARIANT_T1)   return row->cyc_t1;
    if (variant_idx == _CPE_VARIANT_T1S)  return row->cyc_t1s;
    if (variant_idx == _CPE_VARIANT_LOG3) return row->cyc_log3;
    return 0.0;
}
/* Per-butterfly cost for a known (R, variant) pair. Falls back from CPE
* to ops/SIMD when the empirical slot is empty. */
/* Per-butterfly cost for a known (R, variant) pair. Prefers the measured
 * CPE; when that slot is empty, approximates as total ops / SIMD lanes.
 * Returns 0.0 only when the profile has no ops recorded either. */
static inline double _radix_butterfly_cost_variant(
    const stride_radix_profile_t *p, int R, int variant_idx, int isa_avx512)
{
    const double measured = _radix_cpe_lookup(R, variant_idx, isa_avx512);
    if (measured > 0.0) return measured;
    const int ops = _stride_total_ops(p);
    if (ops == 0) return 0.0;
    const double lanes = isa_avx512 ? 8.0 : 4.0;
    return (double)ops / lanes;
}
/* Per-butterfly cost for radix R, mirroring the variant selection that
* _stride_build_plan does at plan-construction time.
*
* Plan-build's order ([planner.h:_stride_build_plan]):
* 1. stage 0 → always n1 (no twiddle on outermost DIT pass)
* 2. stage 1+:
* a. if stride_prefer_dit_log3(R, me, ios) AND log3 codelet exists:
* attach t1_fwd_log3, set stage_skip_t1s
* b. else if stride_prefer_buf(R, me, ios) AND buf codelet exists:
* attach t1_buf_fwd, set stage_skip_t1s (currently inert —
* the predicate is wired but rarely fires for the bench radixes)
* c. else: attach plain t1_fwd
* d. post-pass: if !stage_skip_t1s AND stride_prefer_t1s(R, me, ios)
* AND t1s codelet exists: attach t1s_fwd ALONGSIDE t1_fwd. The
* executor's runtime preference picks t1s when both are attached.
*
* Net runtime variant ranking (highest priority first): log3, t1s, t1.
* (buf is inert today — handled by t1 fallback.)
*
* The cost model returns the cycle count of whichever variant the executor
* will actually run. This makes estimate-mode plan scores match the wall-
* clock cost of what the same plan does in the executor — no drift between
* cost model and runtime selection. */
static inline double _radix_butterfly_cost(int R, int stage_idx,
                                           size_t me, size_t ios,
                                           int isa_avx512)
{
    const double linear_guess = (double)R * 4.0;
    if (R <= 0 || R >= STRIDE_RADIX_PROFILE_MAX_R)
        return linear_guess; /* unknown radix — assume cost linear in R */

    /* Stage 0 carries no twiddles, so the plan always runs the n1 codelet. */
    if (stage_idx == 0) {
        const stride_radix_profile_t *prof =
            isa_avx512 ? &stride_radix_profile_n1_avx512[R]
                       : &stride_radix_profile_n1_avx2[R];
        double c = _radix_butterfly_cost_variant(prof, R, _CPE_VARIANT_N1,
                                                 isa_avx512);
        return (c > 0.0) ? c : linear_guess;
    }

    /* Stages 1+: mirror _stride_build_plan's runtime variant ranking —
     * log3, then t1s, then plain t1. Each preferred branch falls through
     * when its CPE slot was never measured, so we never return 0.
     * stride_prefer_* return 0 for radixes without per-radix wisdom
     * (e.g. R=2); those land directly on t1, matching the executor. */
    if (stride_prefer_dit_log3(R, me, ios)) {
        double c = _radix_cpe_lookup(R, _CPE_VARIANT_LOG3, isa_avx512);
        if (c > 0.0) return c;
        /* log3 not measured for this radix — try t1s/t1 below */
    }
    if (stride_prefer_t1s(R, me, ios)) {
        double c = _radix_cpe_lookup(R, _CPE_VARIANT_T1S, isa_avx512);
        if (c > 0.0) return c;
        /* t1s not measured — fall through to t1 */
    }

    /* Default flat t1. Some radixes are n1-only (no t1 profile slot),
     * so fall back to the n1 profile when t1 records zero ops; the CPE
     * lookup inside the helper still tries cyc_t1 first. */
    const stride_radix_profile_t *prof =
        isa_avx512 ? &stride_radix_profile_t1_avx512[R]
                   : &stride_radix_profile_t1_avx2[R];
    if (_stride_total_ops(prof) == 0) {
        prof = isa_avx512 ? &stride_radix_profile_n1_avx512[R]
                          : &stride_radix_profile_n1_avx2[R];
    }
    double c = _radix_butterfly_cost_variant(prof, R, _CPE_VARIANT_T1,
                                             isa_avx512);
    return (c > 0.0) ? c : linear_guess;
}
/* Heuristic cost of executing the stages in the given order.
 * Lower is better; units are roughly "weighted element touches". */
static double stride_score_factorization(const int *factors, int nf, size_t K,
                                         int N, const stride_cpu_info_t *cpu) {
/* ISA selection from compile-time defines — matches build.py's flags. */
#if defined(__AVX512F__)
    const int isa_avx512 = 1;
#else
    const int isa_avx512 = 0;
#endif
    double total = 0.0;
    size_t twiddle_K = K; /* K * product of factors already executed */
    for (int s = 0; s < nf; s++) {
        const int R = factors[s];
        const int groups = N / R;
        /* Stage stride in doubles — the distance between butterfly legs.
         * Identical to _stride_ios_at_stage() in _stride_build_plan, so
         * cost model and planner query wisdom_bridge with the same
         * (R, me, ios) tuple. */
        size_t ios = K;
        for (int d = s + 1; d < nf; d++) ios *= (size_t)factors[d];
        /* Variant-aware per-butterfly cycles: me=K, ios=stride. */
        const double bf = _radix_butterfly_cost(R, s, K, ios, isa_avx512);
        /* Penalize stages whose per-group working set spills a cache level. */
        const size_t ws = (size_t)R * ios * 16;
        double penalty;
        if (ws <= cpu->l1d_bytes)
            penalty = 1.0;
        else if (ws <= cpu->l2_bytes)
            penalty = 3.0;
        else
            penalty = 10.0;
        /* groups * K butterflies run in this stage, each costing bf. */
        total += (double)groups * (double)K * bf * penalty;
        /* Twiddle-table memory traffic for stages 1+ (the twiddle
         * arithmetic itself is already inside bf — codelet mul/fma ops). */
        if (s > 0) {
            const size_t tw_bytes = (size_t)(R - 1) * twiddle_K * 16;
            if (tw_bytes > cpu->l1d_bytes)
                total += (double)(R - 1) * twiddle_K * 4.0;
            else
                total += (double)(R - 1) * twiddle_K;
        }
        twiddle_K *= (size_t)R;
    }
    return total;
}
/* ═══════════════════════════════════════════════════════════════
* PERMUTATION SEARCH
*
* Given a set of radixes (from greedy factorizer), try all orderings
* and pick the one with best score. For nf <= 4 (24 permutations)
* this is instant. For nf=5 (120) still fast.
*
* For nf > 5, use greedy order only (720+ permutations too slow
* for heuristic scoring; use auto-tune with real benchmarks instead).
* ═══════════════════════════════════════════════════════════════ */
/* Depth-first enumeration of all orderings of arr[depth..n-1]; each
 * complete ordering is scored and the best (lowest) is kept. The
 * swap/undo pair restores arr exactly on the way back up, so the array
 * is unchanged when the top-level call returns. */
static void _permute_and_score(int *arr, int n, int depth,
                               size_t K, int N, const stride_cpu_info_t *cpu,
                               int *best_order, double *best_score) {
    if (depth >= n) {
        double sc = stride_score_factorization(arr, n, K, N, cpu);
        if (sc < *best_score) {
            *best_score = sc;
            memcpy(best_order, arr, (size_t)n * sizeof(int));
        }
        return;
    }
    for (int pick = depth; pick < n; pick++) {
        const int at_depth = arr[depth];
        const int at_pick = arr[pick];
        arr[depth] = at_pick; /* place candidate at position `depth` */
        arr[pick] = at_depth;
        _permute_and_score(arr, n, depth + 1, K, N, cpu,
                           best_order, best_score);
        arr[depth] = at_depth; /* undo the swap */
        arr[pick] = at_pick;
    }
}
/* Reorder fact->factors to the lowest-scoring permutation.
 * Skipped for nf > 5: 720+ orderings is too slow for heuristic
 * scoring — auto-tune with real benchmarks covers those sizes. */
static void stride_optimize_order(stride_factorization_t *fact, size_t K,
                                  int N, const stride_cpu_info_t *cpu) {
    const int nf = fact->nfactors;
    if (nf <= 1) return;
    if (nf > 5) return;
    int scratch[FACT_MAX_STAGES];
    int winner[FACT_MAX_STAGES];
    double winner_score = 1e30;
    memcpy(scratch, fact->factors, (size_t)nf * sizeof(int));
    memcpy(winner, fact->factors, (size_t)nf * sizeof(int));
    _permute_and_score(scratch, nf, 0, K, N, cpu, winner, &winner_score);
    memcpy(fact->factors, winner, (size_t)nf * sizeof(int));
}
/* ═══════════════════════════════════════════════════════════════
* SCORED DECOMPOSITION SEARCH
*
* Enumerate ALL ordered factorizations of N using the available radix set
* (up to FACT_MAX_STAGES factors), score each with stride_score_factorization
* (which uses the per-radix arithmetic + register-pressure profile from
* radix_profile.h), and return the lowest-scoring one.
*
* This replaces the greedy "biggest-radix-first" heuristic for the
* cost-model-driven plan path (stride_estimate_plan / VFFT_ESTIMATE).
* Greedy is preserved for legacy stride_factorize callers.
*
* Search space: bounded by min(FACT_MAX_STAGES, log2(N)) recursion depth
* times the available radix count (~18). For typical N ≤ 16384 this is
* dozens to hundreds of candidates — sub-millisecond evaluation total.
* ═══════════════════════════════════════════════════════════════ */
/* Mutable state for the exhaustive decomposition search: the path being
 * explored plus the best-scoring complete factorization seen so far. */
typedef struct {
int factors[FACT_MAX_STAGES]; /* current path */
int best_factors[FACT_MAX_STAGES]; /* lowest-scoring complete path so far */
int best_nf; /* stage count of best_factors; 0 = none found yet */
double best_score; /* score of best_factors (init to a huge sentinel) */
int N; /* transform length being decomposed */
size_t K; /* batch/stride parameter passed to the scorer */
const stride_registry_t *reg; /* available codelets (n1_fwd[R] != NULL) */
const stride_cpu_info_t *cpu; /* cache sizes for the cost model */
} _stride_decomp_search_t;
/* Depth-first enumeration of every ordered factorization of `remaining`
 * over the registered radix set; complete paths are scored and the best
 * one is recorded in s->best_*. */
static void _stride_decomp_search_recurse(_stride_decomp_search_t *s,
                                          int remaining, int nf)
{
    if (remaining == 1) {
        if (nf == 0) return; /* trivial N=1 case — no factors to score */
        double sc = stride_score_factorization(s->factors, nf,
                                               s->K, s->N, s->cpu);
        if (sc < s->best_score) {
            s->best_score = sc;
            s->best_nf = nf;
            memcpy(s->best_factors, s->factors, (size_t)nf * sizeof(int));
        }
        return;
    }
    if (nf >= FACT_MAX_STAGES) return;
    /* Extend the path with each registered radix dividing the remainder.
     * R starts at 2 (R=1 is degenerate); ascending order, depth-first. */
    for (int R = 2; R < STRIDE_REG_MAX_RADIX; R++) {
        if (remaining % R) continue;
        if (!s->reg->n1_fwd[R]) continue; /* no codelet for this radix */
        s->factors[nf] = R;
        _stride_decomp_search_recurse(s, remaining / R, nf + 1);
    }
}
/* Top-level entry: returns 0 on success (fact filled), 1 if N is not
* fully decomposable into the available radix set (caller should fall
* back to Bluestein/Rader for prime / non-smooth N). */
static int stride_factorize_scored(int N, size_t K,
                                   const stride_registry_t *reg,
                                   const stride_cpu_info_t *cpu,
                                   stride_factorization_t *fact)
{
    memset(fact, 0, sizeof(*fact));
    if (N <= 1) return 0; /* trivial transform — zero stages */
    _stride_decomp_search_t search;
    memset(&search, 0, sizeof(search));
    search.best_score = 1e30;
    search.N = N;
    search.K = K;
    search.reg = reg;
    search.cpu = cpu;
    _stride_decomp_search_recurse(&search, N, 0);
    if (search.best_nf == 0)
        return 1; /* N not smooth over the radix set — caller falls back */
    fact->nfactors = search.best_nf;
    memcpy(fact->factors, search.best_factors,
           (size_t)search.best_nf * sizeof(int));
    return 0;
}
/* ═══════════════════════════════════════════════════════════════
* TOP-LEVEL FACTORIZER (legacy — greedy heuristic only)
*
* 1. Greedy decomposition (finds the radix SET)
* 2. Permutation search (finds the best ORDER) — currently disabled
*
* For the cost-model-driven path use stride_factorize_scored instead.
* ═══════════════════════════════════════════════════════════════ */
static int stride_factorize(int N, size_t K,
                            const stride_registry_t *reg,
                            stride_factorization_t *fact) {
    /* Legacy entry point: greedy decomposition only. The greedy pass
     * already applies K-aware ordering (small K → descending, large
     * K → ascending), and no permutation search is run here — the
     * heuristic score is considered unreliable for that; exhaustive
     * wisdom search handles cases needing truly optimal ordering. */
    stride_cpu_info_t cpu = stride_detect_cpu();
    return stride_factorize_greedy(N, K, reg, &cpu, fact);
}
/* ═══════════════════════════════════════════════════════════════
* LOG3 TWIDDLE DERIVATION — DISABLED
*
* Log3 codelets derive twiddle factors from a few base values via
* cmul chains, reducing L1 cache pressure at large K. The codelets
* exist in the registry (t1_fwd_log3/t1_bwd_log3) and are correct
* (220/220 tests passed), but benchmarking on i9-14900KF (hybrid
* P/E-core) showed gains too inconsistent to trust:
* - Wins at one K, loses at adjacent K (spiky, not monotonic)
* - Results flip between runs (R=19: NEVER → always wins → NEVER)
* - Only R=12 K=1024 showed consistent >15% gains
*
* Attempted approaches that didn't resolve the inconsistency:
* 1. Physics-based estimate model (codelet profiles: spill_bytes,
* bf_flops, n_bases, n_derived → L1 capacity / register pressure
* threshold). Matched 11/16 radixes within 2x but couldn't
* predict genfft primes (DAG-optimized, Sethi-Ullman scheduled).
* 2. Interleaved calibration (shared buffer, alternating batch order,
* 7 rounds median, 15% margin, two-pass confirmation). Still
* produced run-to-run variance on hybrid CPU architecture.
*
* The log3 infrastructure (codelets, executor support, calibration)
* remains in the codebase. To re-enable on stable hardware (e.g.,
* server with isolated cores), restore the threshold logic and
* add back the codelet profile table + estimate model that was
* removed in this cleanup.
* ===================================================================== */
#endif /* STRIDE_FACTORIZER_H */