Skip to content

Commit 983fca5

Browse files
authored
Merge branch 'develop' into jn/build-exe
2 parents 7414e36 + b227de9 commit 983fca5

1,169 files changed

Lines changed: 43801 additions & 27400 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.cirrus.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ FreeBSD_task:
151151
image_family: freebsd-14-3
152152
install_script:
153153
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
154-
- ln -s /usr/local/lib/gcc13/libgfortran.so.5.0.0 /usr/lib/libgfortran.so
154+
- ln -s /usr/local/lib/gcc14/libgfortran.so.5.0.0 /usr/lib/libgfortran.so
155155
compile_script:
156156
- gmake CC=clang FC=gfortran USE_OPENMP=1 CPP_THREAD_SAFETY_TEST=1
157157

.github/workflows/apple_m.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ jobs:
9999
run: |
100100
export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
101101
export CC="/opt/homebrew/opt/llvm/bin/clang"
102+
export RANLIB=llvm-ranlib
102103
case "${{ matrix.build }}" in
103104
"make")
104105
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=${{matrix.openmp}} INTERFACE64=${{matrix.ilp64}} FC="ccache ${{ matrix.fortran }}"

CMakeLists.txt

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ project(OpenBLAS C ASM)
99

1010
set(OpenBLAS_MAJOR_VERSION 0)
1111
set(OpenBLAS_MINOR_VERSION 3)
12-
set(OpenBLAS_PATCH_VERSION 30.dev)
12+
set(OpenBLAS_PATCH_VERSION 31.dev)
1313

1414
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
1515

@@ -308,8 +308,8 @@ if (USE_OPENMP)
308308
endif()
309309
endif()
310310

311-
# Fix "Argument list too long" for macOS with POWERPC or Intel CPUs
312-
if(APPLE AND (NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64"))
311+
# Fix "Argument list too long" for macOS - mostly seen with older OS versions on POWERPC or Intel CPUs
312+
if(APPLE)
313313
# Use response files
314314
set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
315315
# Always build static library first
@@ -708,6 +708,39 @@ if(NOT NO_LAPACKE)
708708
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
709709
)
710710
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
711+
if (NOT (x${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "x")) # only when a symbol prefix and/or suffix is configured: install rewritten copies of lapacke.h and lapack.h
712+
message (STATUS "Generating lapacke.h in ${CMAKE_INSTALL_INCLUDEDIR}")
713+
set(LAPACKE_H ${CMAKE_BINARY_DIR}/generated/lapacke.h)
714+
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke.h LAPACKE_H_CONTENTS)
715+
if (NOT "x${SYMBOLPREFIX}" STREQUAL "x") # quoted x-idiom keeps the condition well-formed when the prefix is empty
716+
string(REGEX REPLACE "(LAPACKE_*)" " ${SYMBOLPREFIX}\\1" LAPACKE_H_CONTENTS_NEW "${LAPACKE_H_CONTENTS}")
717+
string(REPLACE "_ ${SYMBOLPREFIX}LAPACKE_H_" "_LAPACKE_H_" LAPACKE_H_CONTENTS "${LAPACKE_H_CONTENTS_NEW}") # undo the rewrite on the include guard; input is quoted so the ';' characters of the header are not consumed as list separators
718+
string(REPLACE "${SYMBOLPREFIX}LAPACKE_malloc" "LAPACKE_malloc" LAPACKE_H_CONTENTS_NEW "${LAPACKE_H_CONTENTS}") # LAPACKE_malloc keeps its original name
719+
string(REPLACE "${SYMBOLPREFIX}LAPACKE_free" "LAPACKE_free" LAPACKE_H_CONTENTS "${LAPACKE_H_CONTENTS_NEW}") # LAPACKE_free keeps its original name
720+
set(LAPACKE_H_CONTENTS_NEW "${LAPACKE_H_CONTENTS}")
721+
endif()
722+
if (NOT "x${SYMBOLSUFFIX}" STREQUAL "x") # quoted x-idiom keeps the condition well-formed when the suffix is empty
723+
string(REGEX REPLACE "(${SYMBOLPREFIX}LAPACKE_[a-z1-9]*[^ (]*)" "\\1${SYMBOLSUFFIX}" LAPACKE_H_CONTENTS_NEW "${LAPACKE_H_CONTENTS}")
724+
string(REPLACE "#define${SYMBOLSUFFIX}" "#define" LAPACKE_H_CONTENTS "${LAPACKE_H_CONTENTS_NEW}") # strip a suffix that ended up attached to "#define"
725+
string(REPLACE "LAPACKE_malloc${SYMBOLSUFFIX}" "LAPACKE_malloc" LAPACKE_H_CONTENTS_NEW "${LAPACKE_H_CONTENTS}") # LAPACKE_malloc keeps its original name
726+
string(REPLACE "LAPACKE_free${SYMBOLSUFFIX}" "LAPACKE_free" LAPACKE_H_CONTENTS "${LAPACKE_H_CONTENTS_NEW}") # LAPACKE_free keeps its original name
727+
set(LAPACKE_H_CONTENTS_NEW "${LAPACKE_H_CONTENTS}")
728+
endif()
729+
file(WRITE ${LAPACKE_H} "${LAPACKE_H_CONTENTS_NEW}")
730+
install (FILES ${LAPACKE_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
731+
message (STATUS "Generating lapack.h in ${CMAKE_INSTALL_INCLUDEDIR}")
732+
set(LAPACK_H ${CMAKE_BINARY_DIR}/generated/lapack.h)
733+
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapack.h LAPACK_H_CONTENTS)
734+
if (NOT "x${SYMBOLPREFIX}" STREQUAL "x") # quoted x-idiom keeps the condition well-formed when the prefix is empty
735+
string(REGEX REPLACE "(LAPACK_[a-z1-9]*[ \(][.\)]*)" "${SYMBOLPREFIX}\\1" LAPACK_H_CONTENTS_NEW "${LAPACK_H_CONTENTS}")
736+
set(LAPACK_H_CONTENTS "${LAPACK_H_CONTENTS_NEW}") # quoted so the header text is copied verbatim instead of being re-split as a list
737+
endif()
738+
if (NOT "x${SYMBOLSUFFIX}" STREQUAL "x") # quoted x-idiom keeps the condition well-formed when the suffix is empty
739+
string(REGEX REPLACE "(${SYMBOLPREFIX}LAPACK_[a-z1-9]*)([ \(].\)" "\\1${SYMBOLSUFFIX}\\2" LAPACK_H_CONTENTS_NEW "${LAPACK_H_CONTENTS}")
740+
endif()
741+
file(WRITE ${LAPACK_H} "${LAPACK_H_CONTENTS_NEW}")
742+
install (FILES ${LAPACK_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
743+
endif()
711744
endif()
712745

713746
# Install pkg-config files

CONTRIBUTORS.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@
2929
* Annop Wongwathanarat <annop.wongwathanarat@arm.com>
3030
* Optimizations and other improvements targeting AArch64
3131

32+
* Anna Mayne <anna.mayne@arm.com>
33+
* Optimizations and other improvements targeting AArch64
34+
3235
## Previous Developers
3336

3437
* Zaheer Chothia <zaheer.chothia@gmail.com>
@@ -267,3 +270,8 @@ In chronological order:
267270
* [2025-05-29] Optimise axpby kernel for RISCV64_ZVL256B
268271
* [2025-06-05] Optimise hbmv kernel for RISCV64_ZVL256B
269272

273+
* Anna Mayne <anna.mayne@arm.com>
274+
* [2025-11-19] Update thread throttling profile for SGEMV on NEOVERSEV1 and NEOVERSEV2
275+
276+
* Fadi Arafeh <fadi.arafeh@arm.com>
277+
* [2026-03-05] Accelerate SVE128 SBGEMM/BGEMM

Changelog.txt

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,120 @@
11
OpenBLAS ChangeLog
2+
====================================================================
3+
Version 0.3.31
4+
15-Jan-2026
5+
6+
general:
7+
- reverted a matrix partitioning optimization from 0.3.30 that could lead to
8+
race conditions and subsequent invalid results in GEMM
9+
- added the bfloat16 extensions BGEMM and BGEMV
10+
- added a BLAS interface for the ?GEMM_BATCH extensions
11+
- added the BLAS extensions ?GEMM_BATCH_STRIDED and their CBLAS interface
12+
- added the basic infrastructure for half-precision float (FP16) format
13+
using SH prefix
14+
- reimplemented the LAPACK SLAED3/DLAED3 function using multithreading, thereby
15+
improving the performance of the SSYEVD/DSYEVD eigensolver for symmetric matrices
16+
on all platforms
17+
- limited the number of retries for initial memory allocation to avoid infinite
18+
hanging on low-memory systems
19+
- fixed a thread lockup situation encountered with python 3.9 or older and numpy
20+
- introduced a problem size threshold for multithreading in STRMV/DTRMV
21+
- introduced a problem size threshold for multithreading in CHER/CHER2/CHPR/CHPR2
22+
and ZHER/ZHER2/ZHPR/ZHPR2
23+
- improved the problem size thresholds for multithreading in SGER/DGER
24+
- improved autodetection of the Fortran compiler
25+
- fixed passing of the INTERFACE64=1 option to the flang-new compiler
26+
- fixed a potential deadlock in multithreaded code after calling fork()
27+
- fixed builds using CMake on FreeBSD
28+
- fixed builds using CMake from within Cygwin on Windows
29+
- fixed builds using CMake and the NVHPC compiler on ARM64
30+
- fixed CMake build error from misdetecting compiler or OpenMP versions
31+
- improved contents of the CMake-generated OpenBLASConfig.cmake file
32+
- added support for cross-compilation to RISCV targets via CMake
33+
- fixed cross-compilation to x86 targets from non-x86 architectures
34+
- fixed failure to install cblas.h if NO_CBLAS=0 was specified
35+
- fixed missing user-defined pre- and postfixes on functions in lapack.h and lapacke.h
36+
- included fixes from the Reference-LAPACK project:
37+
- fix ordering bug in ?LAED/?LASD (Reference-LAPACK PR 1140)
38+
- revert changes in ?GEEV from PR 1129 (Reference-LAPACK PR 1142)
39+
- fix workspace allocation in LAPACKE_?TRSEN (Reference-LAPACK PR 1144)
40+
41+
riscv:
42+
- added optimized SBGEMM kernels for ZVL128B and ZVL256B targets
43+
- added optimized SHGEMM kernels for ZVL128B and ZVL256B targets
44+
- added optimized SBGEMV and SHGEMV kernels for ZVL128B/ZVL256B
45+
- improved performance of the GEMV kernel for ZVL256B
46+
- improved the performance of the CROT and ZROT kernels for ZVL128B and x280
47+
- improved the detection of RVV1.0 capability
48+
- improved performance of the matrix packing helper functions for ZVL128B and ZVL256B
49+
- improved performance of OMATCOPY for ZVL128B and ZVL256B
50+
51+
arm:
52+
- fixed spurious executable stack in the getarch utility
53+
54+
arm64:
55+
- fixed spurious executable stack in the getarch utility
56+
- fixed compiler warnings arising from the timer macro RPCC
57+
- fixed cache size detection for Qualcomm Oryon under Windows on Arm
58+
- fixed argument handling in the default SVE kernel for SDOT/DDOT
59+
- building the BFLOAT16 kernels is now enabled by default
60+
- improved the overall performance of GEMM, SYMM and HEMM on A64FX
61+
- improved the performance of SDOT/DDOT on A64FX
62+
- improved the multithreading performance of SDOT/DDOT on A64FX by
63+
introduction of a throttling table matching thread count to problem size
64+
- improved the performance of SGER/DGER on A64FX and NEOVERSEV1
65+
- improved the multithreading performance of GEMM on A64FX and NEOVERSEV1
66+
- improved the performance of the GEMV kernel for SVE-capable targets
67+
- improved the multithreading performance of SGEMM on NEOVERSEV1 and V2
68+
- added optimized SAXPY/DAXPY SVE kernels for A64FX and NEOVERSEV1
69+
- added optimized BGEMM and BGEMV kernels for NEOVERSEV1
70+
- added an optimized BGEMM kernel for NEOVERSEN2
71+
- added support for the NEOVERSEV2 cpu
72+
- added dedicated support for the Apple M4 cpu as VORTEXM4
73+
- added optimized SGEMM/SSYMM/STRMM/SSYRK/SSYR2K for SME-capable targets
74+
(ARMV9SME and VORTEXM4)
75+
- improved the precision of the SNRM2 kernel
76+
- added cpu autodetection and compiler settings for Ampere One processors
77+
- fixed cpu autodetection for Apple M systems running Linux
78+
- fixed building on MacOS with AppleClang, gfortran and Xcode v16 or newer
79+
- fixed several errors in the C code replacements for the complex and double
80+
precision complex LAPACK functions that get used (only) when compiling with
81+
Microsoft C and NOFORTRAN=1 under MS Windows
82+
83+
power:
84+
- added initial support for the POWER11 architecture
85+
- improved performance of DGEMM and DGEMV on POWER10
86+
- fixed the default compiler flags to use "-O3" instead of the possibly unsafe
87+
"-Ofast"
88+
- fixed building under MacOS (for old G4 Macs) with CMake
89+
- fixed potential miscompilation of DGEMV and other assembly kernels by gcc15.1
90+
- fixed compilation with recent versions of flang
91+
92+
loongarch64:
93+
- fixed warnings and potential inaccuracies arising from incorrect saving of registers
94+
- fixed enumeration of logical cores on big NUMA servers
95+
- fixed building with LLVM and the INTERFACE64=1 option
96+
97+
x86:
98+
- fixed building the GEMM3M kernels for the GENERIC target
99+
- fixed several errors in the C code replacements for the complex and double
100+
precision complex LAPACK functions that get used (only) when compiling with
101+
Microsoft C and NOFORTRAN=1 under MS Windows
102+
103+
x86_64:
104+
- added cpu autodetection for Intel Lunar Lake (Core Ultra 200V)
105+
- changed all ?MIN and ?MAX assembly kernels to use unaligned operations
106+
- fixed several errors in the C code replacements for the complex and double
107+
precision complex LAPACK functions that get used (only) when compiling with
108+
Microsoft C and NOFORTRAN=1 under MS Windows
109+
- fixed potential crashes in builds for Cooper Lake, Sapphire Rapids or Zen5 cpus
110+
under MS Windows
111+
112+
zarch:
113+
- added support for building with CMake
114+
115+
sparc:
116+
- fixed a potential crash in the DNRM2 kernel
117+
2118
====================================================================
3119
Version 0.3.30
4120
19-Jun-2025

Jenkinsfile.pwr

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,31 @@
11
pipeline {
2-
agent {
3-
docker {
4-
image 'osuosl/ubuntu-ppc64le:18.04'
5-
}
6-
}
2+
agent none
73
stages {
8-
stage('Build') {
4+
stage('GCC build') {
5+
agent {
6+
docker {
7+
image 'osuosl/ubuntu-ppc64le:18.04' // gcc 7, gfortran 7
8+
}
9+
}
910
steps {
11+
checkout scm
1012
sh 'sudo apt update'
1113
sh 'sudo apt install gfortran -y'
1214
sh 'make clean && make'
1315
}
1416
}
17+
stage('Clang build') {
18+
agent {
19+
docker {
20+
image 'osuosl/ubuntu-ppc64le:20.04' // clang 10, gfortran 9
21+
}
22+
}
23+
steps {
24+
checkout scm
25+
sh 'sudo apt update'
26+
sh 'sudo apt install -y clang gfortran'
27+
sh 'make clean && make CC=clang'
28+
}
29+
}
1530
}
1631
}

Makefile.arm64

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,11 @@ endif
6161
ifeq ($(CORE), ARMV9SME)
6262
CCOMMON_OPT += -march=armv9-a+sve2+sme
6363
FCOMMON_OPT += -march=armv9-a+sve2
64+
ifdef OS_WINDOWS
65+
ifeq ($(C_COMPILER), CLANG)
66+
CCOMMON_OPT += --aarch64-stack-hazard-size=0
67+
endif
68+
endif
6469
endif
6570

6671
ifeq ($(CORE), CORTEXA53)
@@ -303,6 +308,20 @@ FCOMMON_OPT += -march=armv8.3-a
303308
endif
304309
endif
305310

311+
ifeq ($(CORE), VORTEXM4)
312+
ifneq ($(C_COMPILER), GCC)
313+
ifeq ($(APPLECLANG),1)
314+
CCOMMON_OPT += -march=armv8.4-a+sme
315+
else
316+
CCOMMON_OPT += -march=armv8.4-a+sme
317+
override LDFLAGS += -lclang_rt_builtins-aarch64
318+
endif
319+
else
320+
CCOMMON_OPT += -march=armv8.4-a
321+
endif
322+
FCOMMON_OPT += -march=armv8.4-a
323+
endif
324+
306325
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
307326
ifeq ($(CORE), TSV110)
308327
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110

Makefile.install

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,9 +93,27 @@ endif
9393

9494
ifneq ($(OSNAME), AIX)
9595
ifneq ($(NO_LAPACKE), 1)
96+
@cp $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h lapacke_h.tmp
97+
ifdef SYMBOLPREFIX
98+
@sed 's/LAPACKE_[a-z1-9].[^() ]*/$(SYMBOLPREFIX)&/g' lapacke_h.tmp > lapacke.tmp2
99+
@mv lapacke.tmp2 lapacke_h.tmp
100+
endif
101+
ifdef SYMBOLSUFFIX
102+
@sed 's/LAPACKE_[a-z1-9].[^() ]*/&$(SYMBOLSUFFIX)/g' lapacke_h.tmp > lapacke.tmp2
103+
@mv lapacke.tmp2 lapacke_h.tmp
104+
endif
105+
@-install -m644 lapacke_h.tmp "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
96106
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
97-
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
98-
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
107+
@cp $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h lapack_h.tmp
108+
ifdef SYMBOLPREFIX
109+
@sed 's/LAPACK_[a-z1-9]*(\.\.\.)/$(SYMBOLPREFIX)&/g' lapack_h.tmp > lapack.tmp2
110+
@mv lapack.tmp2 lapack_h.tmp
111+
endif
112+
ifdef SYMBOLSUFFIX
113+
@sed 's/\(#define $(SYMBOLPREFIX)LAPACK_[a-z1-9].*\)\((...)\)/\1$(SYMBOLSUFFIX)\2/g' lapack_h.tmp > lapack.tmp2
114+
@mv lapack.tmp2 lapack_h.tmp
115+
endif
116+
@-install -m644 lapack_h.tmp "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
99117
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
100118
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
101119
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"

Makefile.rule

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#
44

55
# This library's version
6-
VERSION = 0.3.30.dev
6+
VERSION = 0.3.31.dev
77

88
# If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a
99
# and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library

Makefile.system

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,7 @@ HAVE_SSE5=
331331
HAVE_AVX=
332332
HAVE_AVX2=
333333
HAVE_FMA3=
334+
HAVE_SME=
334335
include $(TOPDIR)/Makefile_kernel.conf
335336
endif
336337

@@ -427,7 +428,7 @@ ifndef MACOSX_DEPLOYMENT_TARGET
427428
ifeq ($(ARCH), arm64)
428429
export MACOSX_DEPLOYMENT_TARGET=11.0
429430
export NO_SVE = 1
430-
export NO_SME = 1
431+
# export NO_SME = 1
431432
else
432433
export MACOSX_DEPLOYMENT_TARGET=10.8
433434
endif
@@ -721,6 +722,11 @@ DYNAMIC_CORE += A64FX
721722
endif
722723
ifneq ($(NO_SME), 1)
723724
DYNAMIC_CORE += ARMV9SME
725+
ifeq ($(OSNAME), Darwin)
726+
ifneq ($(C_COMPILER), GCC)
727+
DYNAMIC_CORE += VORTEXM4
728+
endif
729+
endif
724730
endif
725731
DYNAMIC_CORE += THUNDERX
726732
DYNAMIC_CORE += THUNDERX2T99
@@ -1904,6 +1910,7 @@ ifndef NO_MSA
19041910
export HAVE_MSA
19051911
export MSA_FLAGS
19061912
endif
1913+
export HAVE_SME
19071914
export KERNELDIR
19081915
export FUNCTION_PROFILE
19091916
export TARGET_CORE

0 commit comments

Comments
 (0)