|
13 | 13 | #include "nmod_vec.h" |
14 | 14 | #include "nmod_mat.h" |
15 | 15 |
|
| 16 | +#if FLINT_USES_BLAS || (FLINT_BITS == 32) || !defined(__AVX2__) |
| 17 | + |
| 18 | +/* Tuned for Zen 3 with BLAS; also used on 32-bit builds and on builds without AVX2 */ |
| 19 | +static const slong lu_cutoff_tab[64] = { 64, 64, 244, 280, 296, 320, 332, |
| 20 | + 792, 800, 800, 800, 400, 400, 400, 400, 404, 404, 400, 408, 416, 400, |
| 21 | + 412, 424, 408, 484, 1352, 1032, 260, 68, 56, 56, 56, 148, 160, 148, 156, |
| 22 | + 156, 156, 120, 168, 160, 124, 156, 124, 156, 148, 112, 156, 148, 136, |
| 23 | + 156, 160, 116, 136, 168, 148, 156, 160, 120, 128, 68, 64, 104, 104 }; |
| 24 | + |
| 25 | +#else |
| 26 | + |
| 27 | +/* Tuned for Zen 3 without BLAS (only used when FLINT_BITS != 32 and AVX2 is available) */ |
| 28 | +static const slong lu_cutoff_tab[64] = { 64, 64, 212, 260, 280, 316, 344, |
| 29 | + 792, 872, 904, 856, 1016, 1136, 1456, 1440, 1464, 1376, 1392, 1448, 1448, |
| 30 | + 1360, 1392, 1400, 1392, 1448, 1416, 1032, 260, 68, 56, 56, 60, 168, 164, |
| 31 | + 152, 152, 156, 148, 148, 148, 152, 164, 160, 148, 164, 148, 164, 160, 160, |
| 32 | + 148, 156, 156, 148, 164, 148, 164, 148, 148, 148, 128, 68, 64, 96, 96 }; |
| 33 | + |
| 34 | +#endif |
| 35 | + |
16 | 36 | slong |
17 | 37 | nmod_mat_lu(slong * P, nmod_mat_t A, int rank_check) |
18 | 38 | { |
19 | | - slong nrows, ncols, n, cutoff; |
20 | | - int bits; |
21 | | - nrows = A->r; |
22 | | - ncols = A->c; |
23 | | - |
| 39 | + slong nrows = A->r, ncols = A->c, n; |
24 | 40 | n = FLINT_MIN(nrows, ncols); |
25 | 41 |
|
26 | | - if (n <= 3) |
| 42 | + if (n <= 3 || (NMOD_BITS(A->mod) > 28 && n <= 7)) |
27 | 43 | { |
28 | 44 | return nmod_mat_lu_classical(P, A, rank_check); |
29 | 45 | } |
30 | 46 | else |
31 | 47 | { |
32 | | - if (n >= 20) |
33 | | - { |
34 | | - bits = NMOD_BITS(A->mod); |
35 | | - |
36 | | - if (bits >= FLINT_BITS - 1) |
37 | | - cutoff = 80; |
38 | | - else if (bits >= FLINT_BITS / 2 - 2) |
39 | | - cutoff = 60; |
40 | | - else if (bits >= FLINT_BITS / 4 - 1) |
41 | | - cutoff = 180; |
42 | | - else |
43 | | - cutoff = 60; |
44 | | - |
45 | | - if (n >= cutoff) |
46 | | - return nmod_mat_lu_recursive(P, A, rank_check); |
47 | | - } |
48 | | - |
49 | | - const dot_params_t params = _nmod_vec_dot_params(n, A->mod); |
50 | | - |
51 | | - // TODO thresholds to re-examine after dot product changes |
52 | | - if (params.method <= _DOT1 // <= 0,1 limb |
53 | | - || (params.method <= _DOT2 && n >= 12) // <= 2 limbs (n >= 12 if exactly 2) |
54 | | - || (params.method > _DOT2 && n >= 20)) // == 3 limbs && n >= 20 |
| 48 | + if (n < lu_cutoff_tab[NMOD_BITS(A->mod) - 1]) |
55 | 49 | return nmod_mat_lu_classical_delayed(P, A, rank_check); |
56 | 50 | else |
57 | | - return nmod_mat_lu_classical(P, A, rank_check); |
| 51 | + return nmod_mat_lu_recursive(P, A, rank_check); |
58 | 52 | } |
59 | 53 | } |
| 54 | + |
0 commit comments