Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
138 commits
Select commit Hold shift + click to select a range
548a9f3
Merge remote-tracking branch 'origin/develop' into HEAD
ChipKerchner Mar 11, 2026
376d3a1
Fast performing edges for FP32 GEMM of RVV.
ChipKerchner Mar 12, 2026
6d6af1d
Add bool types for C.
ChipKerchner Mar 12, 2026
9c16449
Add K-unrolling to M = 8. Other small changes.
ChipKerchner Mar 13, 2026
fda433f
Unroll K for N less than or equal to 4.
ChipKerchner Mar 13, 2026
eb9bbcc
Common unroll code.
ChipKerchner Mar 14, 2026
b0ee407
Preserve K.
ChipKerchner Mar 14, 2026
010f24f
Better K.
ChipKerchner Mar 16, 2026
f927b94
Global optimizations.
ChipKerchner Mar 16, 2026
79d9fe3
Use mf2 instead of m1.
ChipKerchner Mar 17, 2026
477dd40
Simplier loops.
ChipKerchner Mar 17, 2026
d832ee5
More global optimzation and clean up.
ChipKerchner Mar 18, 2026
1e48686
Merge remote-tracking branch 'origin/develop' into fasterRVVEdges
ChipKerchner Mar 19, 2026
a8a00bb
Avoid greater than 4 segment load and store penalties by using 2. Fi…
ChipKerchner Mar 19, 2026
1bb72b2
Only initialize unused variables to prevent GCC warnings.
ChipKerchner Mar 20, 2026
ebf4cd1
Fix typo.
ChipKerchner Mar 22, 2026
821242e
Merge pull request #5706 from OpenMathLib/release-0.3.0
martin-frbg Mar 23, 2026
d511552
Update version to 0.3.32.dev
martin-frbg Mar 23, 2026
b8697b3
Update version to 0.3.32.dev
martin-frbg Mar 23, 2026
c114ca5
Reapply "build: fix rule for building dynamic files"
vtjnash Mar 24, 2026
8fc0004
Fix another typo.
ChipKerchner Mar 24, 2026
8697164
c_check: loongarch64: Fix typo
iv-m Mar 24, 2026
e3ce462
Use volatile attribute for SDOT only, to avoid creating new miscompil…
martin-frbg Mar 24, 2026
1f1fcd4
Merge pull request #5709 from iv-m/loongarch64-fix-typo
martin-frbg Mar 24, 2026
f6d4fe7
Fix incorrect cast from BF16 to FP32 in SBGEMM
murste01 Mar 26, 2026
7086a1b
typedef the unsupported fp16 as bfloat16 on Loongarch64 too
martin-frbg Mar 26, 2026
3c188e4
Merge pull request #5712 from murste01/develop
martin-frbg Mar 27, 2026
2671786
Merge pull request #5715 from martin-frbg/issue5714
martin-frbg Mar 27, 2026
e6eba9f
Add optimized FP16 shgemm for for NEOVERSEN2 target
yuanjia111 Mar 27, 2026
b8dbc4a
Merge pull request #5716 from yuanjia111/develop
martin-frbg Mar 27, 2026
75511cb
POSIX strncasecmp is strnicmp in Windows on Arm
martin-frbg Mar 27, 2026
0315003
Do not build SME targets in DYNAMIC_ARCH under Windows
martin-frbg Mar 27, 2026
3ebfc0e
Merge pull request #5718 from martin-frbg/issue5625
martin-frbg Mar 27, 2026
0f9f6e4
Merge pull request #5710 from martin-frbg/issue5708
martin-frbg Mar 27, 2026
16211b7
Add CortexA75/76 via CortexA73 and restore VORTEX for use with DYNAMI…
martin-frbg Mar 29, 2026
d26960a
Merge pull request #5719 from martin-frbg/issue5713
martin-frbg Mar 30, 2026
605b128
Add ?LARF1F and ?LARF1L (Reference-LAPACK PRs 1019/1020)
martin-frbg Mar 30, 2026
f5f789f
Implement ?LARF1F and ?ORM2R (Reference-LAPACK PRs 1019/1020/1196)
martin-frbg Mar 30, 2026
d9bb8f3
Implement ?LARF1F and ?ORM2R (Reference-LAPACK PRs 1019/1020/1196)
martin-frbg Mar 30, 2026
4342764
Implement ?LARF1F and ?ORM2R (Reference-LAPACK PRs 1019/1020/1196)
martin-frbg Mar 30, 2026
af63f2a
Add C replacements for ?LARF1F/?LARF1L
martin-frbg Mar 30, 2026
a5d0f89
Add C replacements for ?LARF1F/?LARF1L
martin-frbg Mar 30, 2026
ff5dc3e
Change loop ordering to improve performance (Reference-LAPACK PR 1023)
martin-frbg Mar 30, 2026
d69be17
Convert 2X LMUL1 instructions to 1X LMUL2. Improved FP64 GEMM edges …
ChipKerchner Mar 30, 2026
904f9d6
Merge pull request #5721 from martin-frbg/lapack1020
martin-frbg Mar 30, 2026
a03cd30
Change WORK(LWORK) to WORK(*) (Reference-LAPACK PR 1094)
martin-frbg Mar 30, 2026
e19e140
Add NaN checks for input matrix A (Reference-LAPACK PR 1136)
martin-frbg Mar 30, 2026
55d7dd8
Fix support for jobu and jobv (Reference-LAPACK PR 1146)
martin-frbg Mar 30, 2026
edad2a8
Fix display of minor version number (Reference-LAPACK PR 1149)
martin-frbg Mar 30, 2026
1243314
Fix display of minor version number (Reference-LAPACK PR 1149)
martin-frbg Mar 30, 2026
004cf0d
Fix seed to avoid FMA-sensitive ill-conditioned matrix (Reference-LAP…
martin-frbg Mar 30, 2026
6dad37f
Merge pull request #5722 from martin-frbg/lapack1023
martin-frbg Mar 31, 2026
37e189c
Fix truncation of large workspace values (Reference-LAPACK PR 1195)
martin-frbg Mar 31, 2026
6e89813
Fix spurious overwriting of caller variable LDSWORK (Reference-LAPACK…
martin-frbg Mar 31, 2026
391cbf8
Pass IINFO instead of INFO to ??PGVX (Reference-LAPACK PR 1207)
martin-frbg Mar 31, 2026
f085c70
Remove unused parameter (Reference-LAPACK PR 1209)
martin-frbg Mar 31, 2026
844939a
Enable testing of the driver routines (Reference-LAPACK PR 1211)
martin-frbg Mar 31, 2026
e486254
Merge pull request #5723 from martin-frbg/lapack1094
martin-frbg Mar 31, 2026
4bbb9fe
Fix workspace size (Reference-LAPACK PR 774)
martin-frbg Mar 31, 2026
cc74393
Fix workspace size (Reference-LAPACK PR 774)
martin-frbg Mar 31, 2026
66cc9f0
Merge pull request #5724 from martin-frbg/lapack1136
martin-frbg Mar 31, 2026
daa3215
Remove shadow variable.
ChipKerchner Mar 31, 2026
aec6170
Merge pull request #5725 from martin-frbg/lapack1146
martin-frbg Mar 31, 2026
664f176
Merge pull request #5726 from martin-frbg/lapack1149
martin-frbg Mar 31, 2026
9816062
Merge pull request #5727 from martin-frbg/lapack1187
martin-frbg Mar 31, 2026
f1f36c0
Merge pull request #5729 from martin-frbg/lapack1195
martin-frbg Mar 31, 2026
eaeaf95
Merge pull request #5731 from martin-frbg/lapack1207
martin-frbg Mar 31, 2026
62dcdca
Merge pull request #5733 from martin-frbg/lapack1211
martin-frbg Mar 31, 2026
3f2338b
Merge pull request #5732 from martin-frbg/lapack1209
martin-frbg Mar 31, 2026
429d23f
Merge pull request #5730 from martin-frbg/lapack1206
martin-frbg Mar 31, 2026
d27e98c
Merge pull request #5734 from martin-frbg/lapack774
martin-frbg Apr 1, 2026
b9ba9be
Follow-up on ?GESVDQ updates from PR1146 (Reference-LAPACK PR 1221)
martin-frbg Apr 1, 2026
c6e4d17
Follow-up on ?GESVDQ updates from PR 1146 (Reference-LAPACK PR 1221)
martin-frbg Apr 1, 2026
7dde52d
Follow-up on ?GESVDQ updates from PR 1146 (Reference-LAPACK PR 1221)
martin-frbg Apr 1, 2026
93515c2
Merge pull request #5736 from martin-frbg/lapack1221
martin-frbg Apr 1, 2026
47be0d8
Fix access beyond array size
martin-frbg Apr 2, 2026
3bbd755
Add note on using an x86 OpenBLAS in Windows on Arm via Prism
martin-frbg Apr 2, 2026
bef5f1c
Merge pull request #5740 from martin-frbg/issue5739
martin-frbg Apr 2, 2026
9b3cc78
Merge pull request #5741 from martin-frbg/issue5696
martin-frbg Apr 2, 2026
3b1aef1
Use LMUL2 loads in main block.
ChipKerchner Apr 2, 2026
22b7950
Use LMUL2 for calculations in main block - just break them apart befo…
ChipKerchner Apr 2, 2026
cc1b579
Reduce number of vectors in use from 32 to 24 for last stage of main …
ChipKerchner Apr 2, 2026
0a4d6b2
Forgot files from previous check-in.
ChipKerchner Apr 2, 2026
f272216
lapack/laed3: fix MinGW build for slaed3
nakatamaho Apr 6, 2026
a04ea2b
MIPS: fix implicit declarations found in the cpuinfo detector
chenx97 Apr 7, 2026
dc32a8a
Try to find out if _Float16 is available on the target before using it
martin-frbg Apr 7, 2026
e41cb1a
Merge pull request #5748 from martin-frbg/issue5747
martin-frbg Apr 7, 2026
fb45e7d
CirrusCI: Fix ranlib confusion between xcode and AndroidNDK (#5749)
martin-frbg Apr 7, 2026
e875a9c
Remove redundant C implemetations from MIPS directories
chenx97 Apr 5, 2026
e926bb0
fix missing endif
martin-frbg Apr 8, 2026
9d9fcc1
Merge pull request #5752 from martin-frbg/fixup5748
martin-frbg Apr 8, 2026
75a9960
Merge pull request #5744 from nakatamaho/fix/slamc3
martin-frbg Apr 8, 2026
9a0f76a
Merge pull request #5746 from chenx97/mips-fix-implicit-declaration
martin-frbg Apr 8, 2026
f9f8e94
Fix DROUNDUP_LWORK: patch was not fully copied
foxtran Apr 9, 2026
6a5d214
Fix dsdot precision for arm/dot.c
chenx97 Apr 8, 2026
1aea1d6
Merge pull request #5753 from foxtran/fix/1203
martin-frbg Apr 9, 2026
0ea2348
Use ROUNDUP_LWORK and remove redundant conversions (Reference-LAPACK …
martin-frbg Apr 9, 2026
2c80f8c
Merge pull request #5755 from martin-frbg/fixup5702
martin-frbg Apr 9, 2026
646d0c9
Merge pull request #5751 from chenx97/mips-dedup-c-impl
martin-frbg Apr 9, 2026
94e053a
Work around miscompilation of the AVX512 ?GEMM kernels by Windows LLVM
martin-frbg Apr 11, 2026
172f41c
Merge pull request #5674 from ChipKerchner/fasterRVVEdges
martin-frbg Apr 11, 2026
b9da7db
Quote the respective SUMM file on failure in BLAS2/3 tests
martin-frbg Apr 12, 2026
d9786d3
fix missing eol
martin-frbg Apr 14, 2026
c59578f
fix conditionals
martin-frbg Apr 14, 2026
70faa9f
Merge pull request #5756 from OpenMathLib/issue5267
martin-frbg Apr 15, 2026
1d15733
Fix typos and initialize cutoff value (Reference-LAPACK PRs 1212&1228)
martin-frbg Apr 15, 2026
7073b68
Merge pull request #5757 from martin-frbg/lapack1212+1228
martin-frbg Apr 15, 2026
330abcd
Merge pull request #5707 from vtjnash/jn/makefile-rule-dynamic-2
martin-frbg Apr 15, 2026
822b873
Automatically assume BIGNUMA if more than 256 CPUs
martin-frbg Apr 15, 2026
3a3903a
Add CPU ID for Neoverse V3
nSircombe Apr 15, 2026
5f4a5b3
Update (c) years and contributors
nSircombe Apr 16, 2026
2847354
Allow target-specific GEMM and TRSM overrides
teddygood Apr 16, 2026
ecca5e4
Add WASM128_GENERIC STRSM and DTRSM kernels
teddygood Apr 16, 2026
fe23ead
Merge pull request #5758 from martin-frbg/spack_bignuma
martin-frbg Apr 16, 2026
4991de5
Merge pull request #5759 from nSircombe/feature/v3support
martin-frbg Apr 16, 2026
b77cd0a
Merge pull request #5760 from teddygood/wasm-trsm
martin-frbg Apr 16, 2026
2a5d33a
Add part code for NeoverseV3
martin-frbg Apr 16, 2026
b8bb6d0
Merge pull request #5761 from martin-frbg/fixup5759
martin-frbg Apr 16, 2026
6f0dfd5
Guard against eventual overflow of the config string
martin-frbg Apr 16, 2026
02dc625
Merge pull request #5762 from martin-frbg/issue5750
martin-frbg Apr 16, 2026
c87e4fc
Disable gcc-specific version checks for NVIDIA HPC
martin-frbg Apr 21, 2026
41dbca9
Merge pull request #5765 from martin-frbg/issue5764
martin-frbg Apr 21, 2026
775f467
Optimize looping over the lower triangular in fat matrix cases
martin-frbg Apr 21, 2026
9a46ffb
Merge pull request #5766 from martin-frbg/lapack1251
martin-frbg Apr 21, 2026
7448679
Move quick return out of the scope of the DYNAMIC_ARCH conditional fo…
martin-frbg Apr 22, 2026
fd862d4
Remove redundant quick return
martin-frbg Apr 22, 2026
59cfea0
Move quick return out of the scope of the potential DYNAMIC_ARCH check
martin-frbg Apr 22, 2026
d270dcb
Using response files on Mac is incompatible (and unnecessary) with Ninja
martin-frbg Apr 22, 2026
54b939e
Undefine GEMM_PREFERRED_SIZE before redefining to silence warning
martin-frbg Apr 22, 2026
c1bb49d
Omit the entire APPLE-specific block if using Ninja
martin-frbg Apr 22, 2026
ded9a96
Merge pull request #5767 from martin-frbg/issue5763
martin-frbg Apr 22, 2026
e447f2c
Merge pull request #5770 from martin-frbg/undef_prefsize
martin-frbg Apr 22, 2026
07e7594
Merge pull request #5769 from martin-frbg/issue5768
martin-frbg Apr 22, 2026
70d1c2f
Fix EXTERNAL declarations (Reference-LAPACK PR 1257)
martin-frbg Apr 23, 2026
c735618
Merge pull request #5771 from martin-frbg/lapack1257
martin-frbg Apr 23, 2026
81bf61b
Update for 0.3.33
martin-frbg Apr 23, 2026
2bbd111
Merge pull request #5772 from martin-frbg/changelog0333
martin-frbg Apr 23, 2026
b0a0364
Merge branch 'release-0.3.0' into develop
martin-frbg Apr 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
6 changes: 4 additions & 2 deletions .cirrus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -89,14 +89,16 @@ task:
type: text/plain

macos_instance:
image: ghcr.io/cirruslabs/macos-sonoma-xcode:latest
image: ghcr.io/cirruslabs/macos-tahoe-xcode:latest
task:
name: AppleM1/LLVM armv7-androidndk xbuild
compile_script:
- brew install --cask android-ndk
- export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk"
- export CC=/opt/homebrew/share/android-ndk/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
- make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
- export AR=/opt/homebrew/share/android-ndk/toolchains/llvm/prebuilt/darwin-x86_64/bin/llvm-ar
- export RANLIB=/opt/homebrew/share/android-ndk/toolchains/llvm/prebuilt/darwin-x86_64/bin/llvm-ranlib
- make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
always:
config_artifacts:
path: "*conf*"
Expand Down
6 changes: 3 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -309,8 +309,8 @@ if (USE_OPENMP)
endif()

# Fix "Argument list too long" for macOS - mostly seen with older OS versions on POWERPC or Intel CPUs
if(APPLE)
# Use response files
if(APPLE AND "${CMAKE_GENERATOR}" MATCHES ".*Makefiles")
# Use response files to get around the ARG_MAX limit, unless using the Ninja generator
set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
# Always build static library first
if(BUILD_STATIC_LIBS)
Expand All @@ -333,7 +333,7 @@ if(APPLE)
endif()
if(NOT NOFORTRAN)
set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
set(CMAKE_Fortran_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND})
set(CMAKE_Fortran_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND})
if(BUILD_SHARED_LIBS)
set(CMAKE_Fortran_CREATE_SHARED_LIBRARY
"sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
Expand Down
3 changes: 3 additions & 0 deletions CONTRIBUTORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -275,3 +275,6 @@ In chronological order:

* Fadi Arafeh <fadi.arafeh@arm.com>
* [2026-03-05] Accelerate SVE128 SBGEMM/BGEMM

* Nathan Sircombe <nathan.sircombe@arm.com>
* [2026-04-16] Add CPU ID for Neoverse V3
50 changes: 50 additions & 0 deletions Changelog.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,54 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.33
23-Apr-2026

general:
- fixed an incorrect cast in the SBGEMM test case that could lead to spurious test failures
- fixed an invalid memory access in the converted C version of the CBLAS tests
- made the BIGNUMA setting automatic when the number of cores exceeds 256
- Imported recent updates from Reference-LAPACK to realign with its upcoming 3.13.0 release:
- Implement ?LARF1F and ?ORM2R (Reference-LAPACK PRs 1019,1020,1196,1257)
- Change loop order in ?GETC2 to improve performance (Reference-LAPACK PR 1023)
- Change WORK array dimension in ?GELQS/?GEQRS (Reference-LAPACK PR 1094)
- Add NaN checks for input matrix A in ?GEEV (Reference-LAPACK PR 1136)
- Fix support for jobu/v in LAPACKE_?GESVDQ_WORK (Reference-LAPACK PRs 1146,1221)
- Fix display of version number in LAPACK testsuite (Reference-LAPACK PR 1149)
- Fix DGGES test seed to avoid bad matrix cases (Reference-LAPACK PR 1187)
- Fix truncation of large WORK array sizes in ZHE (Reference-LAPACK PR 1195)
- Fix overwriting of LDSWORK parameter in ?TRSYL3 (Reference-LAPACK PR 1206)
- Fix overwriting of error states in some EIG tests (Reference-LAPACK PR 1207)
- Remove unused parameter in DORBDB3/ZUNBDB3 (Reference-LAPACK PR 1209)
- Re-enable testing of ?BB and ?GG driver functions (Reference-LAPACK PR 1211)
- Fix workspace size calculation in ?TGSEN (Reference-LAPACK PR 774)
- Fix typos in the EIG DMD tests and initialize the cutoff variable (Reference-LAPACK PRs 1212,1228)
- Optimize looping in ?LACPY/?LASCL/?LANTR with fat matrix and UPLO=L (Reference-LAPACK PR 1251)

arm64:
- worked around a serious miscompilation of the DDOT kernel by GCC15 (affecting
most non-SVE targets, and SVE targets in the case of non-unit array stride)
- fixed an accuracy issue in the GEMV kernel for Neoverse V1 and other SVE targets
- fixed broken STRMM and SSYMM in DYNAMIC_ARCH builds when running on non-SME hardware
- added an optimized SHGEMM kernel for Neoverse N2
- fixed DYNAMIC_ARCH builds under Windows on Arm
- Added autodetection of Cortex A75/A76 in DYNAMIC_ARCH builds
- Added autodetection of Neoverse V3, currently supported through V2 kernels
- Re-added support for the "VORTEX" target in DYNAMIC_ARCH builds with DYNAMIC_LIST
- Fixed CMake-based builds that use the "Ninja" generator

loongarch64:
- fixed a build failure due to missing support for the new half-precision float type
- fixed a long-standing bug in asserting 64bit capability in the c_check helper script

x86_64:
- added a workaround for miscompilation of the AVX512 GEMM kernels by LLVM on Windows
- fixed a build failure in the LAED3 code when compiling with MinGW on Windows
- fixed CMake-based compilation with the NVIDIA HPC compiler
- Fixed CMake-based builds that use the "Ninja" generator

wasm:
- added optimized kernels for STRSM and DTRSM

====================================================================
Version 0.3.32
23-Mar-2026
Expand Down
2 changes: 1 addition & 1 deletion Makefile.rule
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#

# This library's version
VERSION = 0.3.32
VERSION = 0.3.32.dev

# If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a
# and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library
Expand Down
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
- **Cortex-A53**: same as ARMV8 (different cpu specifications)
- **Cortex-A55**: same as ARMV8 (different cpu specifications)
- **Cortex A57**: Optimized Level-3 and Level-2 functions
- **Cortex A72**: same as A57 ( different cpu specifications)
- **Cortex A72**: same as A57 (different cpu specifications)
- **Cortex A73**: same as A57 (different cpu specifications)
- **Cortex A76**: same as A57 (different cpu specifications)
- **Falkor**: same as A57 (different cpu specifications)
Expand All @@ -189,6 +189,8 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
- **Neoverse V1**: (AWS Graviton3) optimized Level-3 BLAS
- **Neoverse N2**: preliminary support
- **Neoverse V2**: preliminary support
- **Neoverse V3**: preliminary support
- **Neoverse V3AE**: preliminary support
- **Apple Vortex**: preliminary support based on ThunderX2/3
- **Apple VortexM4**: preliminary support based on ThunderX2/3, SME kernels for SGEMM,SSYMM,STRMM,SSYRK,SSYR2K
- **A64FX**: preliminary support, optimized Level-3 BLAS
Expand Down
2 changes: 1 addition & 1 deletion c_check
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ case "$architecture" in
defined=1
;;
arm|arm64) defined=1 ;;
zarch|e2k|alpha|ia64|riscv64|loonarch64|wasm)
zarch|e2k|alpha|ia64|riscv64|loongarch64|wasm)
defined=1
BINARY=64
;;
Expand Down
2 changes: 2 additions & 0 deletions cmake/arch.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,11 @@ if (DYNAMIC_ARCH)
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 11) # SVE ACLE supported in LLVM >= 11
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
endif ()
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19 OR (${CMAKE_C_COMPILER_ID} MATCHES AppleClang AND ${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 17) ) # SME ACLE supported in LLVM >= 19 and AppleClang >= 17
set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME VORTEXM4)
endif()
endif()
endif ()
if (DYNAMIC_LIST)
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
Expand Down
1 change: 1 addition & 0 deletions cmake/cc.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
endif ()

if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC")
set (GCC_VERSION 100)
if (POWER)
set(CCOMMON_OPT "${CCOMMON_OPT} -tp pwr8")
elseif (X86_64)
Expand Down
14 changes: 8 additions & 6 deletions cmake/lapack.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ set(SLASRC
slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f
slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f
slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f
slarrv.f slartv.f
slarf1f.f slarf1l.f slarrv.f slartv.f
slarz.f slarzb.f slarzt.f slasy2.f
slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f
slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f
Expand Down Expand Up @@ -178,6 +178,7 @@ set(CLASRC
claqz0.f claqz1.f claqz2.f claqz3.f
claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f
clarf.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f
clarf1f.f clarf1l.f
clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f90 clartv.f
clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f90
clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f
Expand Down Expand Up @@ -262,7 +263,7 @@ set(DLASRC
dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f
dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f
dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f
dlargv.f dlarrv.f dlartv.f
dlarf1f.f dlarf1l.f dlargv.f dlarrv.f dlartv.f
dlarz.f dlarzb.f dlarzt.f dlasy2.f
dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f
dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f
Expand Down Expand Up @@ -371,7 +372,7 @@ set(ZLASRC
zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f
zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f
zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f
zlarfg.f zlarfgp.f zlarft.f
zlarfg.f zlarfgp.f zlarft.f zlarf1f.f zlarf1l.f
zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f90 zlartv.f
zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
zlassq.f90 zlasyf.f zlasyf_rook.f zlasyf_rk.f zlasyf_aa.f
Expand Down Expand Up @@ -575,7 +576,7 @@ set(SLASRC
slaqr0.c slaqr1.c slaqr2.c slaqr3.c slaqr4.c slaqr5.c
slaqtr.c slar1v.c slar2v.c ilaslr.c ilaslc.c
slarf.c slarfb.c slarfb_gett.c slarfg.c slarfgp.c slarft.c slarfx.c slarfy.c slargv.c
slarrv.c slartv.c
slarf1f.c slarf1l.c slarrv.c slartv.c
slarz.c slarzb.c slarzt.c slasy2.c
slasyf.c slasyf_rook.c slasyf_rk.c slasyf_aa.c
slatbs.c slatdf.c slatps.c slatrd.c slatrs.c slatrz.c
Expand Down Expand Up @@ -681,6 +682,7 @@ set(CLASRC
claqr0.c claqr1.c claqr2.c claqr3.c claqr4.c claqr5.c
claqsp.c claqsy.c clar1v.c clar2v.c ilaclr.c ilaclc.c
clarf.c clarfb.c clarfb_gett.c clarfg.c clarfgp.c clarft.c
clarf1f.c clarf1l.c
clarfx.c clarfy.c clargv.c clarnv.c clarrv.c clartg.c clartv.c
clarz.c clarzb.c clarzt.c clascl.c claset.c clasr.c classq.c
clasyf.c clasyf_rook.c clasyf_rk.c clasyf_aa.c
Expand Down Expand Up @@ -764,7 +766,7 @@ set(DLASRC
dlaqr0.c dlaqr1.c dlaqr2.c dlaqr3.c dlaqr4.c dlaqr5.c
dlaqtr.c dlar1v.c dlar2v.c iladlr.c iladlc.c
dlarf.c dlarfb.c dlarfb_gett.c dlarfg.c dlarfgp.c dlarft.c dlarfx.c dlarfy.c
dlargv.c dlarrv.c dlartv.c
dlarf1f.c dlarf1l.c dlargv.c dlarrv.c dlartv.c
dlarz.c dlarzb.c dlarzt.c dlasy2.c
dlasyf.c dlasyf_rook.c dlasyf_rk.c dlasyf_aa.c
dlatbs.c dlatdf.c dlatps.c dlatrd.c dlatrs.c dlatrz.c
Expand Down Expand Up @@ -871,7 +873,7 @@ set(ZLASRC
zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqp2rk.c zlaqp3rk.c zlaqps.c zlaqsb.c
zlaqr0.c zlaqr1.c zlaqr2.c zlaqr3.c zlaqr4.c zlaqr5.c
zlaqsp.c zlaqsy.c zlar1v.c zlar2v.c ilazlr.c ilazlc.c
zlarcm.c zlarf.c zlarfb.c zlarfb_gett.c
zlarcm.c zlarf.c zlarfb.c zlarfb_gett.c zlarf1f.c zlarf1l.c
zlarfg.c zlarfgp.c zlarft.c
zlarfx.c zlarfy.c zlargv.c zlarnv.c zlarrv.c zlartg.c zlartv.c
zlarz.c zlarzb.c zlarzt.c zlascl.c zlaset.c zlasr.c
Expand Down
5 changes: 3 additions & 2 deletions cpuid_arm64.c
Original file line number Diff line number Diff line change
Expand Up @@ -321,9 +321,10 @@ int detect(void)
return CPU_CORTEXX2;
else if (strstr(cpu_part, "0xd4e")) //X3
return CPU_CORTEXX2;
else if (strstr(cpu_part, "0xd4f")) //NVIDIA Grace et al.
else if (strstr(cpu_part, "0xd4f"))
return CPU_NEOVERSEV2;
else if (strstr(cpu_part, "0xd87") || strstr(cpu_part, "0xd85") || strstr(cpu_part, "0xd83")) // X925/A725
else if (strstr(cpu_part, "0xd87") || strstr(cpu_part, "0xd85") // A725,X925
|| strstr(cpu_part, "0xd84") || strstr(cpu_part, "0xd83")) // V3,V3AE
return CPU_NEOVERSEV2;
else if (strstr(cpu_part, "0xd0b"))
return CPU_CORTEXA76;
Expand Down
81 changes: 40 additions & 41 deletions cpuid_mips.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*****************************************************************************
Copyright (c) 2011-2014, The OpenBLAS Project
Copyright (c) 2011-2026, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
Expand All @@ -13,9 +13,9 @@ modification, are permitted provided that the following conditions are
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
Expand Down Expand Up @@ -109,7 +109,7 @@ int detect(void){
return CPU_1004K;
} else if (strstr(p, " 24K")) {
return CPU_24K;
} else
} else
return CPU_UNKNOWN;
}
#endif
Expand All @@ -136,6 +136,40 @@ void get_subdirname(void){
printf("mips");
}

/*
 * Report whether the feature list found in /proc/cpuinfo mentions `search`.
 *
 * The first line that starts with "Features" or "ASEs implemented" is taken
 * as the feature list; the text after its ':' is split on whitespace and
 * each token is tested with strstr(), so `search` matches as a substring of
 * a token.
 *
 * Returns 1 when a token contains `search`, and 0 otherwise: no match, no
 * such line, /proc/cpuinfo could not be opened, `search` is NULL, or the
 * code was built without __linux defined.
 */
int get_feature(char *search) {

  if (search == NULL)
    return (0);

#ifdef __linux
  FILE *infile;
  char buffer[2048], *p, *t;
  p = (char *)NULL;

  infile = fopen("/proc/cpuinfo", "r");
  if (infile == NULL)
    return (0); /* nothing to inspect; treat as "feature absent" */

  while (fgets(buffer, sizeof(buffer), infile)) {

    if (!strncmp("Features", buffer, 8) ||
        !strncmp("ASEs implemented", buffer, 16)) {
      /* Only step past the ':' when one is actually present. */
      p = strchr(buffer, ':');
      if (p != NULL)
        p++;
      break;
    }
  }

  fclose(infile);

  if (p == NULL)
    return (0);

  /*
   * Test every token, including the first one returned by strtok().
   * Tab and newline are delimiters too, so leading padding after the ':'
   * and the trailing line break never become part of a token.
   */
  for (t = strtok(p, " \t\n"); t != NULL; t = strtok(NULL, " \t\n")) {
    if (strstr(t, search)) {
      return (1);
    }
  }

#endif
  return (0);
}

void get_cpuconfig(void){
if(detect()==CPU_P5600){
printf("#define P5600\n");
Expand Down Expand Up @@ -165,7 +199,7 @@ void get_cpuconfig(void){
}else{
printf("#define UNKNOWN\n");
}
#ifndef NO_MSA
#ifndef NO_MSA
if (get_feature("msa")) printf("#define HAVE_MSA\n");
#endif
}
Expand All @@ -181,38 +215,3 @@ void get_libname(void){
printf("mips\n");
}
}

/*
 * Report whether the feature list found in /proc/cpuinfo mentions `search`.
 *
 * The first line that starts with "Features" or "ASEs implemented" is taken
 * as the feature list; the text after its ':' is split on whitespace and
 * each token is tested with strstr(), so `search` matches as a substring of
 * a token.
 *
 * Returns 1 when a token contains `search`, and 0 otherwise: no match, no
 * such line, /proc/cpuinfo could not be opened, `search` is NULL, or the
 * code was built without __linux defined.
 */
int get_feature(char *search)
{

  if (search == NULL) return(0);

#ifdef __linux
  FILE *infile;
  char buffer[2048], *p,*t;
  p = (char *) NULL ;

  infile = fopen("/proc/cpuinfo", "r");
  if (infile == NULL) return(0);   /* nothing to inspect */

  while (fgets(buffer, sizeof(buffer), infile))
  {

    if (!strncmp("Features", buffer, 8) || !strncmp("ASEs implemented", buffer, 16))
    {
      /* Only step past the ':' when one is actually present. */
      p = strchr(buffer, ':');
      if (p != NULL) p++;
      break;
    }
  }

  fclose(infile);

  if( p == NULL ) return 0;

  /*
   * Test every token, including the first one returned by strtok().
   * Tab and newline are delimiters too, so leading padding after the ':'
   * and the trailing line break never become part of a token.
   */
  for (t = strtok(p, " \t\n"); t != NULL; t = strtok(NULL, " \t\n"))
  {
    if (strstr(t, search)) { return(1); }
  }

#endif
  return(0);
}

Loading
Loading