Skip to content

Commit cb4e4ce

Browse files
committed
Merge remote-tracking branch 'origin' into develop
2 parents 7ca689b + 1a9cf8e commit cb4e4ce

1,102 files changed

Lines changed: 38083 additions & 19205 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.cirrus.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ FreeBSD_task:
151151
image_family: freebsd-14-3
152152
install_script:
153153
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
154-
- ln -s /usr/local/lib/gcc13/libgfortran.so.5.0.0 /usr/lib/libgfortran.so
154+
- ln -s /usr/local/lib/gcc14/libgfortran.so.5.0.0 /usr/lib/libgfortran.so
155155
compile_script:
156156
- gmake CC=clang FC=gfortran USE_OPENMP=1 CPP_THREAD_SAFETY_TEST=1
157157

.github/workflows/apple_m.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ jobs:
9999
run: |
100100
export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
101101
export CC="/opt/homebrew/opt/llvm/bin/clang"
102+
export RANLIB=llvm-ranlib
102103
case "${{ matrix.build }}" in
103104
"make")
104105
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=${{matrix.openmp}} INTERFACE64=${{matrix.ilp64}} FC="ccache ${{ matrix.fortran }}"

.github/workflows/dynamic_arch.yml

Lines changed: 41 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
name: continuous build
22

3-
on: [push, pull_request]
3+
on: [push, pull_request, workflow_dispatch]
44

55
concurrency:
66
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -11,18 +11,25 @@ permissions:
1111

1212
jobs:
1313
build:
14-
if: "github.repository == 'OpenMathLib/OpenBLAS'"
14+
if: "github.repository == 'OpenMathLib/OpenBLAS' || github.event_name == 'workflow_dispatch'"
1515
runs-on: ${{ matrix.os }}
1616

1717
strategy:
1818
fail-fast: false
1919
matrix:
20-
os: [ubuntu-latest, macos-latest]
20+
os: [ubuntu-latest, macos-latest, ubuntu-24.04-arm]
21+
cc: [gcc, clang, clang-21]
2122
fortran: [gfortran, flang]
2223
build: [cmake, make]
2324
exclude:
25+
- os: macos-latest
26+
cc: gcc
27+
- os: macos-latest
28+
cc: clang-21
2429
- os: macos-latest
2530
fortran: flang
31+
- os: ubuntu-24.04-arm
32+
fortran: flang
2633

2734
steps:
2835
- name: Checkout repository
@@ -42,10 +49,23 @@ jobs:
4249
- name: Install Dependencies
4350
run: |
4451
if [ "$RUNNER_OS" == "Linux" ]; then
52+
cat << EOF | sudo tee -a /etc/apt/apt.conf.d/01norecommend
53+
APT::Install-Recommends "0";
54+
APT::Install-Suggests "0";
55+
EOF
4556
sudo apt-get update
46-
sudo apt-get install -y gfortran cmake ccache
47-
wget http://security.ubuntu.com/ubuntu/pool/universe/n/ncurses/libtinfo5_6.3-2ubuntu0.1_amd64.deb
48-
sudo apt install ./libtinfo5_6.3-2ubuntu0.1_amd64.deb
57+
sudo apt-get install -y ccache
58+
if [ "${{ matrix.cc }}" == "clang-21" ]; then
59+
wget https://apt.llvm.org/llvm.sh
60+
chmod +x llvm.sh
61+
sudo ./llvm.sh 21
62+
fi
63+
if [ "${{ matrix.fortran }}" == "flang" ]; then
64+
wget http://security.ubuntu.com/ubuntu/pool/universe/n/ncurses/libtinfo5_6.3-2ubuntu0.1_amd64.deb
65+
sudo apt install ./libtinfo5_6.3-2ubuntu0.1_amd64.deb
66+
else
67+
sudo apt-get install -y ${{ matrix.fortran }}
68+
fi
4969
elif [ "$RUNNER_OS" == "macOS" ]; then
5070
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
5171
brew reinstall gcc
@@ -64,12 +84,12 @@ jobs:
6484
# GNU make and cmake call the compilers differently. It looks like
6585
# that causes the cache to mismatch. Keep the ccache for both build
6686
# tools separate to avoid polluting each other.
67-
key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }}
87+
key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build }}-${{ matrix.cc }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }}
6888
# Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler.
6989
restore-keys: |
70-
ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}
71-
ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}
72-
ccache-${{ runner.os }}-${{ matrix.build }}
90+
ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build }}-${{ matrix.cc }}-${{ matrix.fortran }}-${{ github.ref }}
91+
ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build }}-${{ matrix.cc }}-${{ matrix.fortran }}
92+
ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build }}-${{ matrix.cc }}
7393
7494
- name: Configure ccache
7595
run: |
@@ -89,7 +109,7 @@ jobs:
89109
echo "max_size = 300M" > ~/.ccache/ccache.conf
90110
echo "compression = true" >> ~/.ccache/ccache.conf
91111
ccache -s
92-
112+
93113
- name: Add gfortran runtime to link path
94114
if: matrix.build == 'make' && runner.os == 'macOS'
95115
run: |
@@ -110,7 +130,7 @@ jobs:
110130
fi
111131
case "${{ matrix.build }}" in
112132
"make")
113-
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}"
133+
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 CC="ccache ${{ matrix.cc }}" FC="ccache ${{ matrix.fortran }}"
114134
;;
115135
"cmake")
116136
mkdir build && cd build
@@ -119,6 +139,7 @@ jobs:
119139
-DBUILD_WITHOUT_LAPACK=0 \
120140
-DCMAKE_VERBOSE_MAKEFILE=ON \
121141
-DCMAKE_BUILD_TYPE=Release \
142+
-DCMAKE_C_COMPILER=${{ matrix.cc }} \
122143
-DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \
123144
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
124145
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
@@ -142,13 +163,13 @@ jobs:
142163
"make")
143164
MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0'
144165
echo "::group::Tests in 'test' directory"
145-
make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
166+
make -C test $MAKE_FLAGS CC="ccache ${{ matrix.cc }}" FC="ccache ${{ matrix.fortran }}"
146167
echo "::endgroup::"
147168
echo "::group::Tests in 'ctest' directory"
148-
make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
169+
make -C ctest $MAKE_FLAGS CC="ccache ${{ matrix.cc }}" FC="ccache ${{ matrix.fortran }}"
149170
echo "::endgroup::"
150171
echo "::group::Tests in 'utest' directory"
151-
make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
172+
make -C utest $MAKE_FLAGS CC="ccache ${{ matrix.cc }}" FC="ccache ${{ matrix.fortran }}"
152173
echo "::endgroup::"
153174
;;
154175
"cmake")
@@ -372,15 +393,15 @@ jobs:
372393
steps:
373394
- name: Checkout repository
374395
uses: actions/checkout@v3
375-
396+
376397
- name: Install Dependencies
377398
run: |
378399
sudo apt-get update
379400
sudo apt-get install -y gcc gfortran make
380-
401+
381402
- name: Build OpenBLAS
382403
run: |
383-
make -j${nproc}
404+
make -j${nproc}
384405
make -j${nproc} lapack-test
385-
386-
406+
407+

.github/workflows/riscv64_vector.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,8 @@ jobs:
140140
while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*)
141141
if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi
142142
if [ "${{matrix.target}}" == "RISCV64_ZVL256B" ]; then
143+
qemu-riscv64 test/test_sbgemm &
144+
qemu-riscv64 test/test_sbgemv &
143145
qemu-riscv64 test/test_shgemm &
144146
qemu-riscv64 test/test_shgemv &
145147
qemu-riscv64 test/test_bgemm

CMakeLists.txt

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ project(OpenBLAS C ASM)
99

1010
set(OpenBLAS_MAJOR_VERSION 0)
1111
set(OpenBLAS_MINOR_VERSION 3)
12-
set(OpenBLAS_PATCH_VERSION 30.dev)
12+
set(OpenBLAS_PATCH_VERSION 31.dev)
1313

1414
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
1515

@@ -308,8 +308,8 @@ if (USE_OPENMP)
308308
endif()
309309
endif()
310310

311-
# Fix "Argument list too long" for macOS with POWERPC or Intel CPUs
312-
if(APPLE AND (NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64"))
311+
# Fix "Argument list too long" for macOS - mostly seen with older OS versions on POWERPC or Intel CPUs
312+
if(APPLE)
313313
# Use response files
314314
set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
315315
# Always build static library first
@@ -708,6 +708,39 @@ if(NOT NO_LAPACKE)
708708
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
709709
)
710710
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
711+
if (NOT (x${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "x"))
712+
message (STATUS "Generating lapacke.h in ${CMAKE_INSTALL_INCLUDEDIR}")
713+
set(LAPACKE_H ${CMAKE_BINARY_DIR}/generated/lapacke.h)
714+
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke.h LAPACKE_H_CONTENTS)
715+
if (NOT ${SYMBOLPREFIX} STREQUAL "")
716+
string(REGEX REPLACE "(LAPACKE_*)" " ${SYMBOLPREFIX}\\1" LAPACKE_H_CONTENTS_NEW "${LAPACKE_H_CONTENTS}")
717+
string(REPLACE "_ ${SYMBOLPREFIX}LAPACKE_H_" "_LAPACKE_H_" LAPACKE_H_CONTENTS ${LAPACKE_H_CONTENTS_NEW})
718+
string(REPLACE "${SYMBOLPREFIX}LAPACKE_malloc" "LAPACKE_malloc" LAPACKE_H_CONTENTS_NEW ${LAPACKE_H_CONTENTS})
719+
string(REPLACE "${SYMBOLPREFIX}LAPACKE_free" "LAPACKE_free" LAPACKE_H_CONTENTS ${LAPACKE_H_CONTENTS_NEW})
720+
set(LAPACKE_H_CONTENTS_NEW ${LAPACKE_H_CONTENTS})
721+
endif()
722+
if (NOT ${SYMBOLSUFFIX} STREQUAL "")
723+
string(REGEX REPLACE "(${SYMBOLPREFIX}LAPACKE_[a-z1-9]*[^ (]*)" "\\1${SYMBOLSUFFIX}" LAPACKE_H_CONTENTS_NEW "${LAPACKE_H_CONTENTS}")
724+
string(REPLACE "#define${SYMBOLSUFFIX}" "#define" LAPACKE_H_CONTENTS ${LAPACKE_H_CONTENTS_NEW})
725+
string(REPLACE "LAPACKE_malloc${SYMBOLSUFFIX}" "LAPACKE_malloc" LAPACKE_H_CONTENTS_NEW ${LAPACKE_H_CONTENTS})
726+
string(REPLACE "LAPACKE_free${SYMBOLSUFFIX}" "LAPACKE_free" LAPACKE_H_CONTENTS ${LAPACKE_H_CONTENTS_NEW})
727+
set(LAPACKE_H_CONTENTS_NEW ${LAPACKE_H_CONTENTS})
728+
endif()
729+
file(WRITE ${LAPACKE_H} "${LAPACKE_H_CONTENTS_NEW}")
730+
install (FILES ${LAPACKE_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
731+
message (STATUS "Generating lapack.h in ${CMAKE_INSTALL_INCLUDEDIR}")
732+
set(LAPACK_H ${CMAKE_BINARY_DIR}/generated/lapack.h)
733+
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapack.h LAPACK_H_CONTENTS)
734+
if (NOT ${SYMBOLPREFIX} STREQUAL "")
735+
string(REGEX REPLACE "(LAPACK_[a-z1-9]*[ \(][.\)]*)" "${SYMBOLPREFIX}\\1" LAPACK_H_CONTENTS_NEW "${LAPACK_H_CONTENTS}")
736+
set(LAPACK_H_CONTENTS ${LAPACK_H_CONTENTS_NEW})
737+
endif()
738+
if (NOT ${SYMBOLSUFFIX} STREQUAL "")
739+
string(REGEX REPLACE "(${SYMBOLPREFIX}LAPACK_[a-z1-9]*)([ \(].\)" "\\1${SYMBOLSUFFIX}\\2" LAPACK_H_CONTENTS_NEW "${LAPACK_H_CONTENTS}")
740+
endif()
741+
file(WRITE ${LAPACK_H} "${LAPACK_H_CONTENTS_NEW}")
742+
install (FILES ${LAPACK_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
743+
endif()
711744
endif()
712745

713746
# Install pkg-config files

CONTRIBUTORS.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@
2929
* Annop Wongwathanarat <annop.wongwathanarat@arm.com>
3030
* Optimizations and other improvements targeting AArch64
3131

32+
* Anna Mayne <anna.mayne@arm.com>
33+
* Optimizations and other improvements targeting AArch64
34+
3235
## Previous Developers
3336

3437
* Zaheer Chothia <zaheer.chothia@gmail.com>
@@ -267,3 +270,5 @@ In chronological order:
267270
* [2025-05-29] Optimise axpby kernel for RISCV64_ZVL256B
268271
* [2025-06-05] Optimise hbmv kernel for RISCV64_ZVL256B
269272

273+
* Anna Mayne <anna.mayne@arm.com>
274+
* [2025-11-19] Update thread throttling profile for SGEMV on NEOVERSEV1 and NEOVERSEV2

Changelog.txt

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,120 @@
11
OpenBLAS ChangeLog
2+
====================================================================
3+
Version 0.3.31
4+
15-Jan-2026
5+
6+
general:
7+
- reverted a matrix partitioning optimization from 0.3.30 that could lead to
8+
race conditions and subsequent invalid results in GEMM
9+
- added the bfloat16 extensions BGEMM and BGEMV
10+
- added a BLAS interface for the ?GEMM_BATCH extensions
11+
- added the BLAS extensions ?GEMM_BATCH_STRIDED and their CBLAS interface
12+
- added the basic infrastructure for half-precision float (FP16) format
13+
using SH prefix
14+
- reimplemented the LAPACK SLAED3/DLAED3 function using multithreading, thereby
15+
improving the performance of the SSYEVD/DSYEVD eigensolver for symmetric matrices
16+
on all platforms
17+
- limited the number of retries for initial memory allocation to avoid infinite
18+
hanging on low-memory systems
19+
- fixed a thread lockup situation encountered with Python 3.9 or older and NumPy
20+
- introduced a problem size threshold for multithreading in STRMV/DTRMV
21+
- introduced a problem size threshold for multithreading in CHER/CHER2/CHPR/CHPR2
22+
and ZHER/ZHER2/ZHPR/ZHPR2
23+
- improved the problem size thresholds for multithreading in SGER/DGER
24+
- improved autodetection of the Fortran compiler
25+
- fixed passing of the INTERFACE64=1 option to the flang-new compiler
26+
- fixed a potential deadlock in multithreaded code after calling fork()
27+
- fixed builds using CMake on FreeBSD
28+
- fixed builds using CMake from within Cygwin on Windows
29+
- fixed builds using CMake and the NVHPC compiler on ARM64
30+
- fixed CMake build error from misdetecting compiler or OpenMP versions
31+
- improved contents of the CMake-generated OpenBLASConfig.cmake file
32+
- added support for cross-compilation to RISCV targets via CMake
33+
- fixed cross-compilation to x86 targets from non-x86 architectures
34+
- fixed failure to install cblas.h if NO_CBLAS=0 was specified
35+
- fixed missing user-defined symbol prefixes and suffixes on functions in lapack.h and lapacke.h
36+
- included fixes from the Reference-LAPACK project:
37+
- fix ordering bug in ?LAED/?LASD (Reference-LAPACK PR 1140)
38+
- revert changes in ?GEEV from PR 1129 (Reference-LAPACK PR 1142)
39+
- fix workspace allocation in LAPACKE_?TRSEN (Reference-LAPACK PR 1144)
40+
41+
riscv:
42+
- added optimized SBGEMM kernels for ZVL128B and ZVL256B targets
43+
- added optimized SHGEMM kernels for ZVL128B and ZVL256B targets
44+
- added optimized SBGEMV and SHGEMV kernels for ZVL128B/ZVL256B
45+
- improved performance of the GEMV kernel for ZVL256B
46+
- improved the performance of the CROT and ZROT kernels for ZVL128B and x280
47+
- improved the detection of RVV1.0 capability
48+
- improved performance of the matrix packing helper functions for ZVL128B and ZVL256B
49+
- improved performance of OMATCOPY for ZVL128B and ZVL256B
50+
51+
arm:
52+
- fixed spurious executable stack in the getarch utility
53+
54+
arm64:
55+
- fixed spurious executable stack in the getarch utility
56+
- fixed compiler warnings arising from the timer macro RPCC
57+
- fixed cache size detection for Qualcomm Oryon under Windows on Arm
58+
- fixed argument handling in the default SVE kernel for SDOT/DDOT
59+
- building the BFLOAT16 kernels is now enabled by default
60+
- improved the overall performance of GEMM, SYMM and HEMM on A64FX
61+
- improved the performance of SDOT/DDOT on A64FX
62+
- improved the multithreading performance of SDOT/DDOT on A64FX by
63+
introduction of a throttling table matching thread count to problem size
64+
- improved the performance of SGER/DGER on A64FX and NEOVERSEV1
65+
- improved the multithreading performance of GEMM on A64FX and NEOVERSEV1
66+
- improved the performance of the GEMV kernel for SVE-capable targets
67+
- improved the multithreading performance of SGEMM on NEOVERSEV1 and NEOVERSEV2
68+
- added optimized SAXPY/DAXPY SVE kernels for A64FX and NEOVERSEV1
69+
- added optimized BGEMM and BGEMV kernels for NEOVERSEV1
70+
- added an optimized BGEMM kernel for NEOVERSEN2
71+
- added support for the NEOVERSEV2 cpu
72+
- added dedicated support for the Apple M4 cpu as VORTEXM4
73+
- added optimized SGEMM/SSYMM/STRMM/SSYRK/SSYR2K for SME-capable targets
74+
(ARMV9SME and VORTEXM4)
75+
- improved the precision of the SNRM2 kernel
76+
- added cpu autodetection and compiler settings for Ampere One processors
77+
- fixed cpu autodetection for Apple M systems running Linux
78+
- fixed building on macOS with AppleClang, gfortran and Xcode 16 or newer
79+
- fixed several errors in the C code replacements for the complex and double
80+
precision complex LAPACK functions that get used (only) when compiling with
81+
Microsoft C and NOFORTRAN=1 under MS Windows
82+
83+
power:
84+
- added initial support for the POWER11 architecture
85+
- improved performance of DGEMM and DGEMV on POWER10
86+
- fixed the default compiler flags to use "-O3" instead of the possibly unsafe
87+
"-Ofast"
88+
- fixed building under MacOS (for old G4 Macs) with CMake
89+
- fixed potential miscompilation of DGEMV and other assembly kernels by gcc15.1
90+
- fixed compilation with recent versions of flang
91+
92+
loongarch64:
93+
- fixed warnings and potential inaccuracies arising from incorrect saving of registers
94+
- fixed enumeration of logical cores on big NUMA servers
95+
- fixed building with LLVM and the INTERFACE64=1 option
96+
97+
x86:
98+
- fixed building the GEMM3M kernels for the GENERIC target
99+
- fixed several errors in the C code replacements for the complex and double
100+
precision complex LAPACK functions that get used (only) when compiling with
101+
Microsoft C and NOFORTRAN=1 under MS Windows
102+
103+
x86_64:
104+
- added cpu autodetection for Intel Lunar Lake (Core Ultra 200V)
105+
- changed all ?MIN and ?MAX assembly kernels to use unaligned operations
106+
- fixed several errors in the C code replacements for the complex and double
107+
precision complex LAPACK functions that get used (only) when compiling with
108+
Microsoft C and NOFORTRAN=1 under MS Windows
109+
- fixed potential crashes in builds for Cooper Lake, Sapphire Rapids or Zen5 cpus
110+
under MS Windows
111+
112+
zarch:
113+
- added support for building with CMake
114+
115+
sparc:
116+
- fixed a potential crash in the DNRM2 kernel
117+
2118
====================================================================
3119
Version 0.3.30
4120
19-Jun-2025

0 commit comments

Comments
 (0)