Skip to content

Commit a8ffcea

Browse files
authored
Merge pull request #5773 from OpenMathLib/develop
Merge from develop for 0.3.33 release
2 parents dbad3e1 + b0a0364 commit a8ffcea

315 files changed

Lines changed: 20214 additions & 11274 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.cirrus.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,14 +89,16 @@ task:
8989
type: text/plain
9090

9191
macos_instance:
92-
image: ghcr.io/cirruslabs/macos-sonoma-xcode:latest
92+
image: ghcr.io/cirruslabs/macos-tahoe-xcode:latest
9393
task:
9494
name: AppleM1/LLVM armv7-androidndk xbuild
9595
compile_script:
9696
- brew install --cask android-ndk
9797
- export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk"
9898
- export CC=/opt/homebrew/share/android-ndk/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
99-
- make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
99+
- export AR=/opt/homebrew/share/android-ndk/toolchains/llvm/prebuilt/darwin-x86_64/bin/llvm-ar
100+
- export RANLIB=/opt/homebrew/share/android-ndk/toolchains/llvm/prebuilt/darwin-x86_64/bin/llvm-ranlib
101+
- make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
100102
always:
101103
config_artifacts:
102104
path: "*conf*"

CMakeLists.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -309,8 +309,8 @@ if (USE_OPENMP)
309309
endif()
310310

311311
# Fix "Argument list too long" for macOS - mostly seen with older OS versions on POWERPC or Intel CPUs
312-
if(APPLE)
313-
# Use response files
312+
if(APPLE AND "${CMAKE_GENERATOR}" MATCHES ".*Makefiles")
313+
# Use response files to get around the ARG_MAX limit; only applied for Makefile generators (not needed with e.g. Ninja)
314314
set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
315315
# Always build static library first
316316
if(BUILD_STATIC_LIBS)
@@ -333,7 +333,7 @@ if(APPLE)
333333
endif()
334334
if(NOT NOFORTRAN)
335335
set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
336-
set(CMAKE_Fortran_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND})
336+
set(CMAKE_Fortran_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND})
337337
if(BUILD_SHARED_LIBS)
338338
set(CMAKE_Fortran_CREATE_SHARED_LIBRARY
339339
"sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"

CONTRIBUTORS.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,3 +275,6 @@ In chronological order:
275275

276276
* Fadi Arafeh <fadi.arafeh@arm.com>
277277
* [2026-03-05] Accelerate SVE128 SBGEMM/BGEMM
278+
279+
* Nathan Sircombe <nathan.sircombe@arm.com>
280+
* [2026-04-16] Add CPU ID for Neoverse V3

Changelog.txt

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,54 @@
11
OpenBLAS ChangeLog
2+
====================================================================
3+
Version 0.3.33
4+
23-Apr-2026
5+
6+
general:
7+
- fixed an incorrect cast in the SBGEMM test case that could lead to spurious test failures
8+
- fixed an invalid memory access in the converted C version of the CBLAS tests
9+
- made the BIGNUMA setting automatic when the number of cores exceeds 256
10+
- imported recent updates from Reference-LAPACK to realign with its upcoming 3.13.0 release:
11+
- Implement ?LARF1F and ?ORM2R (Reference-LAPACK PRs 1019,1020,1196,1257)
12+
- Change loop order in ?GETC2 to improve performance (Reference-LAPACK PR 1023)
13+
- Change WORK array dimension in ?GELQS/?GEQRS (Reference-LAPACK PR 1094)
14+
- Add NaN checks for input matrix A in ?GEEV (Reference-LAPACK PR 1136)
15+
- Fix support for jobu/v in LAPACKE_?GESVDQ_WORK (Reference-LAPACK PRs 1146,1221)
16+
- Fix display of version number in LAPACK testsuite (Reference-LAPACK PR 1149)
17+
- Fix DGGES test seed to avoid bad matrix cases (Reference-LAPACK PR 1187)
18+
- Fix truncation of large WORK array sizes in ZHE (Reference-LAPACK PR 1195)
19+
- Fix overwriting of LDSWORK parameter in ?TRSYL3 (Reference-LAPACK PR 1206)
20+
- Fix overwriting of error states in some EIG tests (Reference-LAPACK PR 1207)
21+
- Remove unused parameter in DORBDB3/ZUNBDB3 (Reference-LAPACK PR 1209)
22+
- Re-enable testing of ?BB and ?GG driver functions (Reference-LAPACK PR 1211)
23+
- Fix workspace size calculation in ?TGSEN (Reference-LAPACK PR 774)
24+
- Fix typos in the EIG DMD tests and initialize the cutoff variable (Reference-LAPACK PRs 1212,1228)
25+
- Optimize looping in ?LACPY/?LASCL/?LANTR with fat matrix and UPLO=L (Reference-LAPACK PR 1251)
26+
27+
arm64:
28+
- worked around a serious miscompilation of the DDOT kernel by GCC15, affecting
29+
most non-SVE targets, and SVE targets in the case of non-unit array stride
30+
- fixed an accuracy issue in the GEMV kernel for Neoverse V1 and other SVE targets
31+
- fixed broken STRMM and SSYMM in DYNAMIC_ARCH builds when running on non-SME hardware
32+
- added an optimized SHGEMM kernel for Neoverse N2
33+
- fixed DYNAMIC_ARCH builds under Windows on Arm
34+
- added autodetection of Cortex A75/A76 in DYNAMIC_ARCH builds
35+
- added autodetection of Neoverse V3, currently supported through V2 kernels
36+
- re-added support for the "VORTEX" target in DYNAMIC_ARCH builds with DYNAMIC_LIST
37+
- fixed CMake-based builds that use the "Ninja" generator
38+
39+
loongarch64:
40+
- fixed a build failure due to missing support for the new half-precision float type
41+
- fixed a long-standing bug in asserting 64bit capability in the c_check helper script
42+
43+
x86_64:
44+
- added a workaround for miscompilation of the AVX512 GEMM kernels by LLVM on Windows
45+
- fixed a build failure in the LAED3 code when compiling with MinGW on Windows
46+
- fixed CMake-based compilation with the NVIDIA HPC compiler
47+
- fixed CMake-based builds that use the "Ninja" generator
48+
49+
wasm:
50+
- added optimized kernels for STRSM and DTRSM
51+
252
====================================================================
353
Version 0.3.32
454
23-Mar-2026

Makefile.rule

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#
44

55
# This library's version
6-
VERSION = 0.3.32
6+
VERSION = 0.3.32.dev
77

88
# If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a
99
# and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
176176
- **Cortex-A53**: same as ARMV8 (different cpu specifications)
177177
- **Cortex-A55**: same as ARMV8 (different cpu specifications)
178178
- **Cortex A57**: Optimized Level-3 and Level-2 functions
179-
- **Cortex A72**: same as A57 ( different cpu specifications)
179+
- **Cortex A72**: same as A57 (different cpu specifications)
180180
- **Cortex A73**: same as A57 (different cpu specifications)
181181
- **Cortex A76**: same as A57 (different cpu specifications)
182182
- **Falkor**: same as A57 (different cpu specifications)
@@ -189,6 +189,8 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
189189
- **Neoverse V1**: (AWS Graviton3) optimized Level-3 BLAS
190190
- **Neoverse N2**: preliminary support
191191
- **Neoverse V2**: preliminary support
192+
- **Neoverse V3**: preliminary support
193+
- **Neoverse V3AE**: preliminary support
192194
- **Apple Vortex**: preliminary support based on ThunderX2/3
193195
- **Apple VortexM4**: preliminary support based on ThunderX2/3, SME kernels for SGEMM, SSYMM, STRMM, SSYRK, SSYR2K
194196
- **A64FX**: preliminary support, optimized Level-3 BLAS

c_check

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ case "$architecture" in
131131
defined=1
132132
;;
133133
arm|arm64) defined=1 ;;
134-
zarch|e2k|alpha|ia64|riscv64|loonarch64|wasm)
134+
zarch|e2k|alpha|ia64|riscv64|loongarch64|wasm)
135135
defined=1
136136
BINARY=64
137137
;;

cmake/arch.cmake

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,11 @@ if (DYNAMIC_ARCH)
4848
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 11) # SVE ACLE supported in LLVM >= 11
4949
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
5050
endif ()
51+
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
5152
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19 OR (${CMAKE_C_COMPILER_ID} MATCHES AppleClang AND ${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 17) ) # SME ACLE supported in LLVM >= 19 and AppleClang >= 17
5253
set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME VORTEXM4)
5354
endif()
55+
endif()
5456
endif ()
5557
if (DYNAMIC_LIST)
5658
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})

cmake/cc.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
113113
endif ()
114114

115115
if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC")
116+
set (GCC_VERSION 100)
116117
if (POWER)
117118
set(CCOMMON_OPT "${CCOMMON_OPT} -tp pwr8")
118119
elseif (X86_64)

cmake/lapack.cmake

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ set(SLASRC
7171
slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f
7272
slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f
7373
slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f
74-
slarrv.f slartv.f
74+
slarf1f.f slarf1l.f slarrv.f slartv.f
7575
slarz.f slarzb.f slarzt.f slasy2.f
7676
slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f
7777
slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f
@@ -178,6 +178,7 @@ set(CLASRC
178178
claqz0.f claqz1.f claqz2.f claqz3.f
179179
claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f
180180
clarf.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f
181+
clarf1f.f clarf1l.f
181182
clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f90 clartv.f
182183
clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f90
183184
clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f
@@ -262,7 +263,7 @@ set(DLASRC
262263
dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f
263264
dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f
264265
dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f
265-
dlargv.f dlarrv.f dlartv.f
266+
dlarf1f.f dlarf1l.f dlargv.f dlarrv.f dlartv.f
266267
dlarz.f dlarzb.f dlarzt.f dlasy2.f
267268
dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f
268269
dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f
@@ -371,7 +372,7 @@ set(ZLASRC
371372
zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f
372373
zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f
373374
zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f
374-
zlarfg.f zlarfgp.f zlarft.f
375+
zlarfg.f zlarfgp.f zlarft.f zlarf1f.f zlarf1l.f
375376
zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f90 zlartv.f
376377
zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
377378
zlassq.f90 zlasyf.f zlasyf_rook.f zlasyf_rk.f zlasyf_aa.f
@@ -575,7 +576,7 @@ set(SLASRC
575576
slaqr0.c slaqr1.c slaqr2.c slaqr3.c slaqr4.c slaqr5.c
576577
slaqtr.c slar1v.c slar2v.c ilaslr.c ilaslc.c
577578
slarf.c slarfb.c slarfb_gett.c slarfg.c slarfgp.c slarft.c slarfx.c slarfy.c slargv.c
578-
slarrv.c slartv.c
579+
slarf1f.c slarf1l.c slarrv.c slartv.c
579580
slarz.c slarzb.c slarzt.c slasy2.c
580581
slasyf.c slasyf_rook.c slasyf_rk.c slasyf_aa.c
581582
slatbs.c slatdf.c slatps.c slatrd.c slatrs.c slatrz.c
@@ -681,6 +682,7 @@ set(CLASRC
681682
claqr0.c claqr1.c claqr2.c claqr3.c claqr4.c claqr5.c
682683
claqsp.c claqsy.c clar1v.c clar2v.c ilaclr.c ilaclc.c
683684
clarf.c clarfb.c clarfb_gett.c clarfg.c clarfgp.c clarft.c
685+
clarf1f.c clarf1l.c
684686
clarfx.c clarfy.c clargv.c clarnv.c clarrv.c clartg.c clartv.c
685687
clarz.c clarzb.c clarzt.c clascl.c claset.c clasr.c classq.c
686688
clasyf.c clasyf_rook.c clasyf_rk.c clasyf_aa.c
@@ -764,7 +766,7 @@ set(DLASRC
764766
dlaqr0.c dlaqr1.c dlaqr2.c dlaqr3.c dlaqr4.c dlaqr5.c
765767
dlaqtr.c dlar1v.c dlar2v.c iladlr.c iladlc.c
766768
dlarf.c dlarfb.c dlarfb_gett.c dlarfg.c dlarfgp.c dlarft.c dlarfx.c dlarfy.c
767-
dlargv.c dlarrv.c dlartv.c
769+
dlarf1f.c dlarf1l.c dlargv.c dlarrv.c dlartv.c
768770
dlarz.c dlarzb.c dlarzt.c dlasy2.c
769771
dlasyf.c dlasyf_rook.c dlasyf_rk.c dlasyf_aa.c
770772
dlatbs.c dlatdf.c dlatps.c dlatrd.c dlatrs.c dlatrz.c
@@ -871,7 +873,7 @@ set(ZLASRC
871873
zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqp2rk.c zlaqp3rk.c zlaqps.c zlaqsb.c
872874
zlaqr0.c zlaqr1.c zlaqr2.c zlaqr3.c zlaqr4.c zlaqr5.c
873875
zlaqsp.c zlaqsy.c zlar1v.c zlar2v.c ilazlr.c ilazlc.c
874-
zlarcm.c zlarf.c zlarfb.c zlarfb_gett.c
876+
zlarcm.c zlarf.c zlarfb.c zlarfb_gett.c zlarf1f.c zlarf1l.c
875877
zlarfg.c zlarfgp.c zlarft.c
876878
zlarfx.c zlarfy.c zlargv.c zlarnv.c zlarrv.c zlartg.c zlartv.c
877879
zlarz.c zlarzb.c zlarzt.c zlascl.c zlaset.c zlasr.c

0 commit comments

Comments
 (0)