99#include <math.h>
1010#if defined(HAVE_SME )
1111
/*
 * Symbol names of the SGEMM helper routines referenced by this kernel.
 *
 * When DYNAMIC_ARCH is defined, SGEMM_PREPROCESS and SGEMM_DIRECT2X2 are
 * built by pasting the token TS onto the two *_BASE names below. TS is not
 * defined in this part of the file (presumably a per-target suffix supplied
 * by the build -- TODO confirm). When DYNAMIC_ARCH is not defined, both
 * macros are simply the unsuffixed base names.
 *
 * COMBINE2 forwards to COMBINE so that its arguments are macro-expanded
 * first; using ## directly would paste the unexpanded argument names.
 *
 * NOTE(review): as rendered here there is whitespace between the macro names
 * COMBINE / COMBINE2 and the opening parenthesis of their parameter lists.
 * Taken literally that would make them object-like macros and the pasting
 * would never happen. This looks like an artifact of how the diff was
 * extracted -- confirm the real file has no space there.
 */
12+ #if defined(DYNAMIC_ARCH )
13+ #define COMBINE (a ,b ) a ## b
14+ #define COMBINE2 (a ,b ) COMBINE(a,b)
15+ #define SGEMM_PREPROCESS_BASE sgemm_direct_sme1_preprocess
16+ #define SGEMM_PREPROCESS COMBINE2(SGEMM_PREPROCESS_BASE,TS)
17+ #define SGEMM_DIRECT2X2_BASE sgemm_direct_alpha_beta_sme1_2VLx2VL
18+ #define SGEMM_DIRECT2X2 COMBINE2(SGEMM_DIRECT2X2_BASE,TS)
19+ #else
20+ #define SGEMM_PREPROCESS sgemm_direct_sme1_preprocess
21+ #define SGEMM_DIRECT2X2 sgemm_direct_alpha_beta_sme1_2VLx2VL
22+ #endif
23+
1224#if defined(__ARM_FEATURE_SME ) && defined(__clang__ ) && __clang_major__ >= 16
1325#include <arm_sme.h>
1426#endif
1527
1628/* Function prototypes */
/*
 * Declaration of the external pre-processing routine; it is not defined in
 * this part of the file. Parameters are (nbr, nbc, a, a_mod): a is a
 * read-only restrict-qualified float pointer, a_mod a writable float pointer.
 *
 * The removed lines declare it under the fixed name
 * sgemm_direct_sme1_preprocess with an explicit __asm__ label of the same
 * name. The added lines declare it through the SGEMM_PREPROCESS macro with
 * no __asm__ label, so the referenced symbol is whatever that macro
 * expands to.
 */
17- extern void sgemm_direct_sme1_preprocess (uint64_t nbr , uint64_t nbc ,\
18- const float * restrict a , float * a_mod ) __asm__("sgemm_direct_sme1_preprocess" );
29+ extern void SGEMM_PREPROCESS (uint64_t nbr , uint64_t nbc ,\
30+
31+ const float * restrict a , float * a_mod ) ;
1932
2033/* Function Definitions */
2134static uint64_t sve_cntw () {
@@ -227,7 +240,7 @@ void CNAME (BLASLONG N, BLASLONG K, float alpha, float * __restrict A,\
227240 /* Pre-process the left matrix to make it suitable for
228241 matrix sum of outer-product calculation
229242 */
230- sgemm_direct_sme1_preprocess (N , K , A , A_mod );
243+ SGEMM_PREPROCESS (N , K , A , A_mod );
231244 asm volatile ("" : : :"p0" , "p1" , "p2" , "p3" , "p4" , "p5" , "p6" , "p7" ,
232245 "p8" , "p9" , "p10" , "p11" , "p12" , "p13" , "p14" , "p15" ,
233246 "z0" , "z1" , "z2" , "z3" , "z4" , "z5" , "z6" , "z7" ,
@@ -245,6 +258,8 @@ void CNAME (BLASLONG N, BLASLONG K, float alpha, float * __restrict A,\
245258#else
246259
/*
 * Fallback definition of CNAME, placed after an #else (this appears to be
 * the branch taken when HAVE_SME is not defined -- confirm against the
 * full file, since the lines between the hunks are not visible here).
 *
 * The removed lines show the previous version with an empty body. The added
 * lines give the stub a body that writes a fixed diagnostic to stderr.
 * None of the parameters are read, and nothing is written through A or C.
 *
 * NOTE(review): fprintf and stderr require <stdio.h>; no such include is
 * visible in this part of the file -- confirm it is provided elsewhere.
 */
247260void CNAME (BLASLONG N , BLASLONG K , float alpha , float * __restrict A ,\
248- BLASLONG strideA , float beta , float * __restrict C , BLASLONG strideC ){}
249-
261+ BLASLONG strideA , float beta , float * __restrict C , BLASLONG strideC ){
262+ fprintf (stderr ,"empty ssyrk_direct kernel should never be called\n" );
263+ }
264+
250265#endif
0 commit comments