Skip to content

Commit 77b81af

Browse files
Merge pull request #2640 from fredrik-johansson/n3
Optimize `nmod_mat_lu_classical_delayed`
2 parents: 5c74e17 + b0d7cea; commit: 77b81af

5 files changed

Lines changed: 311 additions & 158 deletions

File tree

src/nmod_mat/lu.c

Lines changed: 25 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -13,47 +13,42 @@
1313
#include "nmod_vec.h"
1414
#include "nmod_mat.h"
1515

16+
#if FLINT_USES_BLAS || (FLINT_BITS == 32) || !defined(__AVX2__)
17+
18+
/* Tuned for Zen 3 with BLAS */
19+
static const slong lu_cutoff_tab[64] = { 64, 64, 244, 280, 296, 320, 332,
20+
792, 800, 800, 800, 400, 400, 400, 400, 404, 404, 400, 408, 416, 400,
21+
412, 424, 408, 484, 1352, 1032, 260, 68, 56, 56, 56, 148, 160, 148, 156,
22+
156, 156, 120, 168, 160, 124, 156, 124, 156, 148, 112, 156, 148, 136,
23+
156, 160, 116, 136, 168, 148, 156, 160, 120, 128, 68, 64, 104, 104 };
24+
25+
#else
26+
27+
/* Tuned for Zen 3 without BLAS */
28+
static const slong lu_cutoff_tab[64] = { 64, 64, 212, 260, 280, 316, 344,
29+
792, 872, 904, 856, 1016, 1136, 1456, 1440, 1464, 1376, 1392, 1448, 1448,
30+
1360, 1392, 1400, 1392, 1448, 1416, 1032, 260, 68, 56, 56, 60, 168, 164,
31+
152, 152, 156, 148, 148, 148, 152, 164, 160, 148, 164, 148, 164, 160, 160,
32+
148, 156, 156, 148, 164, 148, 164, 148, 148, 148, 128, 68, 64, 96, 96 };
33+
34+
#endif
35+
1636
slong
1737
nmod_mat_lu(slong * P, nmod_mat_t A, int rank_check)
1838
{
19-
slong nrows, ncols, n, cutoff;
20-
int bits;
21-
nrows = A->r;
22-
ncols = A->c;
23-
39+
slong nrows = A->r, ncols = A->c, n;
2440
n = FLINT_MIN(nrows, ncols);
2541

26-
if (n <= 3)
42+
if (n <= 3 || (NMOD_BITS(A->mod) > 28 && n <= 7))
2743
{
2844
return nmod_mat_lu_classical(P, A, rank_check);
2945
}
3046
else
3147
{
32-
if (n >= 20)
33-
{
34-
bits = NMOD_BITS(A->mod);
35-
36-
if (bits >= FLINT_BITS - 1)
37-
cutoff = 80;
38-
else if (bits >= FLINT_BITS / 2 - 2)
39-
cutoff = 60;
40-
else if (bits >= FLINT_BITS / 4 - 1)
41-
cutoff = 180;
42-
else
43-
cutoff = 60;
44-
45-
if (n >= cutoff)
46-
return nmod_mat_lu_recursive(P, A, rank_check);
47-
}
48-
49-
const dot_params_t params = _nmod_vec_dot_params(n, A->mod);
50-
51-
// TODO thresholds to re-examine after dot product changes
52-
if (params.method <= _DOT1 // <= 0,1 limb
53-
|| (params.method <= _DOT2 && n >= 12) // <= 2 limbs (n >= 12 if exactly 2)
54-
|| (params.method > _DOT2 && n >= 20)) // == 3 limbs && n >= 20
48+
if (n < lu_cutoff_tab[NMOD_BITS(A->mod) - 1])
5549
return nmod_mat_lu_classical_delayed(P, A, rank_check);
5650
else
57-
return nmod_mat_lu_classical(P, A, rank_check);
51+
return nmod_mat_lu_recursive(P, A, rank_check);
5852
}
5953
}
54+

0 commit comments

Comments
 (0)