Skip to content

Commit 6546da6

Browse files
authored
Merge branch 'OpenMathLib:develop' into fix_dyn_armv9sme
2 parents e5ffb7c + 3e961c2 commit 6546da6

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

41 files changed

+2902
-2809
lines changed

.github/workflows/c910v.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ jobs:
3131

3232
steps:
3333
- name: Checkout repository
34-
uses: actions/checkout@v3
34+
uses: actions/checkout@v4
3535

3636
- name: install build deps
3737
run: |
@@ -40,18 +40,18 @@ jobs:
4040
gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross libglib2.0-dev
4141
4242
- name: checkout qemu
43-
uses: actions/checkout@v3
43+
uses: actions/checkout@v4
4444
with:
45-
repository: T-head-Semi/qemu
45+
repository: XUANTIE-RV/qemu
4646
path: qemu
47-
ref: 1e692ebb43d396c52352406323fc782c1ac99a42
47+
ref: e0ace167effcd36d1f82c7ccb4522b3126011479 # xuantie-qemu-9.0
4848

4949
- name: build qemu
5050
run: |
5151
# Force use c910v qemu-user
52-
wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
52+
wget https://github.com/revyos/qemu/commit/222729c7455784dd855216d7a2bec4bd8f2a6800.patch
5353
cd qemu
54-
patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
54+
patch -p1 < ../222729c7455784dd855216d7a2bec4bd8f2a6800.patch
5555
export CXXFLAGS="-Wno-error"; export CFLAGS="-Wno-error"
5656
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system
5757
make -j$(nproc)

CMakeLists.txt

Lines changed: 38 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -299,23 +299,44 @@ if (USE_OPENMP)
299299
endif()
300300
endif()
301301

302-
# Seems that this hack doesn't required since macOS 11 Big Sur
303-
if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
304-
set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
305-
if (NOT NOFORTRAN)
306-
set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
307-
set (CMAKE_Fortran_CREATE_SHARED_LIBRARY
308-
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
309-
"sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
310-
"sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
311-
"sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
312-
"sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'")
313-
else ()
314-
set (CMAKE_C_CREATE_SHARED_LIBRARY
315-
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
316-
"sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
317-
"sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
318-
endif ()
302+
# Fix "Argument list too long" for macOS with Intel CPUs and DYNAMIC_ARCH turned on
303+
if(APPLE AND DYNAMIC_ARCH AND (NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64"))
304+
# Use response files
305+
set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
306+
# Always build static library first
307+
if(BUILD_STATIC_LIBS)
308+
set(STATIC_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/lib${OpenBLAS_LIBNAME}.a")
309+
else()
310+
add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
311+
set(STATIC_PATH "lib${OpenBLAS_LIBNAME}.a")
312+
endif()
313+
set(CREATE_STATIC_LIBRARY_COMMAND
314+
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/${OpenBLAS_LIBNAME}_static.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru ${STATIC_PATH} && exit 0' "
315+
"sh -c '${CMAKE_AR} -rs ${STATIC_PATH} ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' ")
316+
if(BUILD_SHARED_LIBS)
317+
add_dependencies(${OpenBLAS_LIBNAME}_shared ${OpenBLAS_LIBNAME}_static)
318+
set(SHARED_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib")
319+
endif()
320+
if(USE_OPENMP)
321+
get_target_property(OMP_LIB OpenMP::OpenMP_C INTERFACE_LINK_LIBRARIES)
322+
else()
323+
set(OMP_LIB "")
324+
endif()
325+
if(NOT NOFORTRAN)
326+
set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
327+
set(CMAKE_Fortran_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND})
328+
if(BUILD_SHARED_LIBS)
329+
set(CMAKE_Fortran_CREATE_SHARED_LIBRARY
330+
"sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
331+
"sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,${STATIC_PATH} dummy.o -o ${SHARED_PATH} ${OMP_LIB}'")
332+
endif()
333+
else()
334+
set(CMAKE_C_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND})
335+
if(BUILD_SHARED_LIBS)
336+
set(CMAKE_C_CREATE_SHARED_LIBRARY
337+
"sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,${STATIC_PATH} -o ${SHARED_PATH} ${OMP_LIB}'")
338+
endif()
339+
endif()
319340
endif()
320341

321342
# Handle MSVC exports

CONTRIBUTORS.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,4 +250,7 @@ In chronological order:
250250

251251
* Ye Tao <ye.tao@arm.com>
252252
* [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1
253-
* [2025-02-27] Add sbgemv_n_neon kernel
253+
* [2025-02-27] Add sbgemv_n_neon kernel
254+
255+
* Abhishek Kumar <https://github.com/abhishek-iitmadras>
256+
* [2025-04-22] Optimise dot kernel for NEOVERSE V1

cmake/prebuild.cmake

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1006,15 +1006,15 @@ endif ()
10061006
"#define HAVE_SVE\n"
10071007
"#define ARMV8\n")
10081008
set(SGEMM_UNROLL_M 16)
1009-
set(SGEMM_UNROLL_N 4)
1010-
set(DGEMM_UNROLL_M 8)
1011-
set(DGEMM_UNROLL_N 4)
1012-
set(CGEMM_UNROLL_M 8)
1009+
set(SGEMM_UNROLL_N 8)
1010+
set(DGEMM_UNROLL_M 4)
1011+
set(DGEMM_UNROLL_N 8)
1012+
set(CGEMM_UNROLL_M 2)
10131013
set(CGEMM_UNROLL_N 4)
1014-
set(ZGEMM_UNROLL_M 4)
1014+
set(ZGEMM_UNROLL_M 2)
10151015
set(ZGEMM_UNROLL_N 4)
10161016
set(SYMV_P 16)
1017-
elseif ("${TCORE}" STREQUAL "NEOVERSEN2" or "${TCORE}" STREQUAL "ARMV9SME")
1017+
elseif ("${TCORE}" STREQUAL "NEOVERSEN2" OR "${TCORE}" STREQUAL "ARMV9SME")
10181018
file(APPEND ${TARGET_CONF_TEMP}
10191019
"#define L1_CODE_SIZE\t65536\n"
10201020
"#define L1_CODE_LINESIZE\t64\n"
@@ -1249,6 +1249,25 @@ endif ()
12491249
set(ZGEMM_UNROLL_M 2)
12501250
set(ZGEMM_UNROLL_N 4)
12511251
set(SYMV_P 16)
1252+
elseif ("${TCORE}" STREQUAL "ARMV8SVE" OR "${TCORE}" STREQUAL "CORTEXA510" OR "${TCORE}" STREQUAL "CORTEXX2" OR "${TCORE}" STREQUAL "ARMV9")
1253+
file(APPEND ${TARGET_CONF_TEMP}
1254+
"#define L1_DATA_SIZE\t32768\n"
1255+
"#define L1_DATA_LINESIZE\t64\n"
1256+
"#define L2_SIZE\t262144\n"
1257+
"#define L2_LINESIZE\t64\n"
1258+
"#define DTB_DEFAULT_ENTRIES\t64\n"
1259+
"#define DTB_SIZE\t4096\n"
1260+
"#define L2_ASSOCIATIVE\t32\n"
1261+
"#define ARMV8\n")
1262+
set(SGEMM_UNROLL_M 4)
1263+
set(SGEMM_UNROLL_N 8)
1264+
set(DGEMM_UNROLL_M 4)
1265+
set(DGEMM_UNROLL_N 8)
1266+
set(CGEMM_UNROLL_M 2)
1267+
set(CGEMM_UNROLL_N 4)
1268+
set(ZGEMM_UNROLL_M 2)
1269+
set(ZGEMM_UNROLL_N 4)
1270+
set(SYMV_P 16)
12521271
elseif ("${TCORE}" STREQUAL "P5600")
12531272
file(APPEND ${TARGET_CONF_TEMP}
12541273
"#define L2_SIZE 1048576\n"
@@ -1409,9 +1428,11 @@ endif ()
14091428
# GetArch_2nd
14101429
foreach(float_char S;D;Q;C;Z;X)
14111430
if (NOT DEFINED ${float_char}GEMM_UNROLL_M)
1431+
message(STATUS "setting unrollm=2")
14121432
set(${float_char}GEMM_UNROLL_M 2)
14131433
endif()
14141434
if (NOT DEFINED ${float_char}GEMM_UNROLL_N)
1435+
message(STATUS "setting unrolln=2")
14151436
set(${float_char}GEMM_UNROLL_N 2)
14161437
endif()
14171438
endforeach()

cpuid_arm64.c

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -374,15 +374,20 @@ int detect(void)
374374
}
375375
#else
376376
#ifdef __APPLE__
377+
length64 = sizeof(value64);
377378
sysctlbyname("hw.ncpu",&value64,&length64,NULL,0);
378379
cpulowperf=value64;
380+
length64 = sizeof(value64);
379381
sysctlbyname("hw.nperflevels",&value64,&length64,NULL,0);
380382
if (value64 > 1) {
381-
sysctlbyname("hw.perflevel0.cpusperl",&value64,&length64,NULL,0);
383+
length64 = sizeof(value64);
384+
sysctlbyname("hw.perflevel0.cpusperl2",&value64,&length64,NULL,0);
382385
cpuhiperf=value64;
383-
sysctlbyname("hw.perflevel1.cpusperl",&value64,&length64,NULL,0);
386+
length64 = sizeof(value64);
387+
sysctlbyname("hw.perflevel1.cpusperl2",&value64,&length64,NULL,0);
384388
cpulowperf=value64;
385389
}
390+
length64 = sizeof(value64);
386391
sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0);
387392
if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1
388393
if (value64 == 3660830781) return CPU_VORTEX; //A15/M2
@@ -467,6 +472,7 @@ int n=0;
467472
printf("#define NUM_CORES_HP %d\n",cpuhiperf);
468473
#endif
469474
#ifdef __APPLE__
475+
length64 = sizeof(value64);
470476
sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0);
471477
printf("#define NUM_CORES %d\n",value);
472478
if (cpulowperf >0)
@@ -698,12 +704,17 @@ void get_cpuconfig(void)
698704
case CPU_VORTEX:
699705
printf("#define VORTEX \n");
700706
#ifdef __APPLE__
707+
length64 = sizeof(value64);
701708
sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0);
702709
printf("#define L1_CODE_SIZE %lld \n",value64);
710+
length64 = sizeof(value64);
703711
sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0);
704712
printf("#define L1_CODE_LINESIZE %lld \n",value64);
713+
printf("#define L1_DATA_LINESIZE %lld \n",value64);
714+
length64 = sizeof(value64);
705715
sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0);
706716
printf("#define L1_DATA_SIZE %lld \n",value64);
717+
length64 = sizeof(value64);
707718
sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0);
708719
printf("#define L2_SIZE %lld \n",value64);
709720
#endif

cpuid_x86.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1578,6 +1578,7 @@ int get_cpuname(void){
15781578
case 12: //family 6 exmodel 12
15791579
switch (model) {
15801580
case 15:
1581+
case 6: // Arrow Lake
15811582
if(support_avx512())
15821583
return CPUTYPE_SAPPHIRERAPIDS;
15831584
if(support_avx2())
@@ -2421,6 +2422,22 @@ int get_coretype(void){
24212422
else
24222423
return CORE_NEHALEM;
24232424
}
2425+
case 12:
2426+
switch (model) {
2427+
case 6: // Arrow Lake
2428+
if(support_amx_bf16())
2429+
return CORE_SAPPHIRERAPIDS;
2430+
if(support_avx512_bf16())
2431+
return CORE_COOPERLAKE;
2432+
if(support_avx512())
2433+
return CORE_SKYLAKEX;
2434+
if(support_avx2())
2435+
return CORE_HASWELL;
2436+
if(support_avx())
2437+
return CORE_SANDYBRIDGE;
2438+
else
2439+
return CORE_NEHALEM;
2440+
}
24242441
}
24252442
case 15:
24262443
if (model <= 0x2) return CORE_NORTHWOOD;

kernel/arm64/KERNEL.A64FX

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
include $(KERNELDIR)/KERNEL.ARMV8SVE
22

3-
SGEMVNKERNEL = gemv_n_sve.c
4-
DGEMVNKERNEL = gemv_n_sve.c
3+
SGEMVNKERNEL = gemv_n_sve_v4x3.c
4+
DGEMVNKERNEL = gemv_n_sve_v4x3.c
55
SGEMVTKERNEL = gemv_t_sve_v4x3.c
66
DGEMVTKERNEL = gemv_t_sve_v4x3.c

kernel/arm64/KERNEL.ARMV8SVE

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,8 @@ DSCALKERNEL = scal.S
7474
CSCALKERNEL = zscal.S
7575
ZSCALKERNEL = zscal.S
7676

77-
SGEMVNKERNEL = gemv_n_sve.c
78-
DGEMVNKERNEL = gemv_n.S
77+
SGEMVNKERNEL = gemv_n_sve_v1x3.c
78+
DGEMVNKERNEL = gemv_n_sve_v1x3.c
7979
CGEMVNKERNEL = zgemv_n.S
8080
ZGEMVNKERNEL = zgemv_n.S
8181

@@ -84,6 +84,11 @@ DGEMVTKERNEL = gemv_t_sve_v1x3.c
8484
CGEMVTKERNEL = zgemv_t.S
8585
ZGEMVTKERNEL = zgemv_t.S
8686

87+
SSYMV_L_KERNEL = symv_L_sve_v1x4.c
88+
SSYMV_U_KERNEL = symv_U_sve_v1x4.c
89+
DSYMV_L_KERNEL = symv_L_sve_v1x4.c
90+
DSYMV_U_KERNEL = symv_U_sve_v1x4.c
91+
8792
SASUMKERNEL = sasum_thunderx2t99.c
8893
DASUMKERNEL = dasum_thunderx2t99.c
8994
CASUMKERNEL = casum_thunderx2t99.c

kernel/arm64/KERNEL.NEOVERSEN1

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ DSCALKERNEL = scal.S
6060
CSCALKERNEL = zscal.S
6161
ZSCALKERNEL = zscal.S
6262

63-
SGEMVNKERNEL = gemv_n.S
63+
SGEMVNKERNEL = sgemv_n_neon.c
6464
DGEMVNKERNEL = gemv_n.S
6565
CGEMVNKERNEL = zgemv_n.S
6666
ZGEMVNKERNEL = zgemv_n.S
@@ -70,6 +70,10 @@ DGEMVTKERNEL = gemv_t.S
7070
CGEMVTKERNEL = zgemv_t.S
7171
ZGEMVTKERNEL = zgemv_t.S
7272

73+
SSYMV_L_KERNEL = symv_L_asimd_4x4.c
74+
SSYMV_U_KERNEL = symv_U_asimd_4x4.c
75+
DSYMV_L_KERNEL = symv_L_asimd_4x4.c
76+
DSYMV_U_KERNEL = symv_U_asimd_4x4.c
7377

7478
SASUMKERNEL = sasum_thunderx2t99.c
7579
DASUMKERNEL = dasum_thunderx2t99.c

kernel/arm64/KERNEL.NEOVERSEN2

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,8 @@ DSCALKERNEL = scal.S
6060
CSCALKERNEL = zscal.S
6161
ZSCALKERNEL = zscal.S
6262

63-
SGEMVNKERNEL = gemv_n.S
64-
DGEMVNKERNEL = gemv_n.S
63+
SGEMVNKERNEL = gemv_n_sve_v1x3.c
64+
DGEMVNKERNEL = gemv_n_sve_v1x3.c
6565
CGEMVNKERNEL = zgemv_n.S
6666
ZGEMVNKERNEL = zgemv_n.S
6767

0 commit comments

Comments
 (0)