diff --git a/ChangeLog b/ChangeLog index dfa968fda8..4b69015dbd 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,32 @@ +Feb 5, 2026: version 7.12.2 + + * GraphBLAS 10.3.1: bug and documentation fixes, performance improvements, + and printing of user-defined types using get/set with + GxB_PRINT_FUNCTION. + * SuiteSparse_config 7.12.2: checking for BLAS libraries; better handling + of BLA_VENDOR input parameter to cmake + * Package versions in this release: (* denotes a new version) + SuiteSparse_config 7.12.2 * + AMD 3.3.4 + BTF 2.3.3 + CAMD 3.3.5 + CCOLAMD 3.3.5 + CHOLMOD 5.3.4 + COLAMD 3.3.5 + CSparse 4.3.2 + CXSparse 4.4.2 + Example 1.8.10 + GraphBLAS 10.3.1 * + KLU 2.3.6 + LDL 3.3.3 + LAGraph 1.2.1 + SuiteSparse_Mongoose 3.3.6 + ParU 1.1.0 + RBio 4.3.5 + SPEX 3.2.4 + SPQR 4.3.6 + UMFPACK 6.3.7 + Nov 4, 2025: version 7.12.1 * SuiteSparse_config 7.12.1: fix for ninja; checking BLAS properties diff --git a/GraphBLAS/CMakeLists.txt b/GraphBLAS/CMakeLists.txt index 33827180be..45131cc00b 100644 --- a/GraphBLAS/CMakeLists.txt +++ b/GraphBLAS/CMakeLists.txt @@ -62,7 +62,8 @@ if ( SUITESPARSE_HAS_CUDA AND GRAPHBLAS_USE_CUDA ) # FOR NOW: do not compile FactoryKernels when developing the CUDA kernels set ( GRAPHBLAS_COMPACT ON ) message ( STATUS "GraphBLAS CUDA JIT: enabled") - enable_language ( CUDA ) + set ( CMAKE_CXX_STANDARD 17 ) + enable_language ( CXX CUDA ) set ( GRAPHBLAS_HAS_CUDA ON ) else ( ) message ( STATUS "GraphBLAS CUDA JIT: disabled") @@ -76,37 +77,84 @@ endif ( ) include ( GraphBLAS_JIT_paths ) +#------------------------------------------------------------------------------- +# CUDA +#------------------------------------------------------------------------------- + if ( GRAPHBLAS_HAS_CUDA ) - # with CUDA and RMM - add_subdirectory ( CUDA ) - set ( GB_CUDA GraphBLAS_CUDA ${CUDA_LIBRARIES} ) - set ( GB_RMM RMM_wrap ${CUDA_LIBRARIES} ) - add_subdirectory ( rmm_wrap ) - include_directories ( "rmm_wrap" ${CUDA_INCLUDE_DIRS} - "/usr/local/cuda/include/cub" ) - 
link_directories ( "CUDA" "${CUDA_LIBRARIES}" - "/usr/local/cuda/lib64/stubs" "rmm_wrap" "/usr/local/cuda/lib64" ) + + # with CUDA, and supporting libraries: Rapids Memory Manager (rmm), and spdlog + cmake_policy ( SET CMP0135 NEW ) # URL download timestamp policy + find_package ( CUDAToolkit REQUIRED ) + set ( CMAKE_CUDA_FLAGS "-cudart=static -lineinfo -Wno-deprecated-gpu-targets " ) + set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --std=c++17 -fPIC " ) + message ( STATUS "C++ flags for CUDA: ${CMAKE_CXX_FLAGS}" ) + message ( STATUS "CUDA include dirs: ${CUDA_INCLUDE_DIRS}" ) + message ( STATUS "CUDA libraries: ${CUDA_LIBRARIES}" ) + set ( GB_CUDA ${CUDA_LIBRARIES} ) + + include_directories ( "rmm_wrap" "CUDA/include" "CUDA" ${CUDA_INCLUDE_DIRS} ) + + set ( EXTERNAL_INCLUDES_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external_includes ) + + if ( NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY} ) + file ( MAKE_DIRECTORY ${EXTERNAL_INCLUDES_DIRECTORY} ) + endif ( ) + + if ( NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY}/spdlog ) + message ( STATUS "cloning spdlog v1.10.0" ) + execute_process ( + COMMAND git clone "https://github.com/gabime/spdlog" --branch v1.10.0 --recursive spdlog + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external_includes ) + endif ( ) + + if ( NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY}/rmm ) + message ( STATUS "cloning rmm branch-21.10" ) + execute_process ( + COMMAND git clone "https://github.com/rapidsai/rmm" --branch branch-21.10 --recursive rmm +# COMMAND git clone "https://github.com/rapidsai/rmm" --branch branch-25.10 --recursive rmm + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external_includes ) + endif() + + set ( RMM_WRAP_INCLUDES ${PROJECT_BINARY_DIR} + ${CMAKE_CURRENT_BINARY_DIR}/external_includes/spdlog/include +# for v21.10: + ${CMAKE_CURRENT_BINARY_DIR}/external_includes/rmm/include +# for v25.10: +# ${CMAKE_CURRENT_BINARY_DIR}/external_includes/rmm/cpp/include + ${CUDA_INCLUDE_DIRS} ) + else ( ) - # without CUDA and RMM - set ( CMAKE_RMM_FLAG 
" " ) + + # without CUDA set ( GB_CUDA ) - set ( GB_RMM ) + set ( RMM_WRAP_INCLUDES ) + endif ( ) #------------------------------------------------------------------------------- -# find OpenMP +# OpenMP #------------------------------------------------------------------------------- option ( GRAPHBLAS_USE_OPENMP "ON: Use OpenMP in GraphBLAS if available. OFF: Do not use OpenMP. (Default: SUITESPARSE_USE_OPENMP)" ${SUITESPARSE_USE_OPENMP} ) if ( GRAPHBLAS_USE_OPENMP ) if ( CMAKE_VERSION VERSION_LESS 3.24 ) - find_package ( OpenMP COMPONENTS C ) + if ( GRAPHBLAS_HAS_CUDA ) + find_package ( OpenMP COMPONENTS C CXX ) + else ( ) + find_package ( OpenMP COMPONENTS C ) + endif ( ) else ( ) - find_package ( OpenMP COMPONENTS C GLOBAL ) + if ( GRAPHBLAS_HAS_CUDA ) + find_package ( OpenMP COMPONENTS C CXX GLOBAL ) + else ( ) + find_package ( OpenMP COMPONENTS C GLOBAL ) + endif ( ) endif ( ) else ( ) # OpenMP has been disabled. set ( OpenMP_C_FOUND OFF ) + set ( OpenMP_CXX_FOUND OFF ) endif ( ) if ( GRAPHBLAS_USE_OPENMP AND OpenMP_C_FOUND ) @@ -122,7 +170,7 @@ if ( SUITESPARSE_USE_STRICT AND GRAPHBLAS_USE_OPENMP AND NOT GRAPHBLAS_HAS_OPENM endif ( ) #------------------------------------------------------------------------------- -# find cpu_features +# cpu_features #------------------------------------------------------------------------------- if ( NOT GBNCPUFEAT ) @@ -185,7 +233,7 @@ if ( DEFINED GBRVV ) endif ( ) #------------------------------------------------------------------------------- -# check compiler features +# check compiler support for the complex data types #------------------------------------------------------------------------------- include ( GraphBLAS_complex ) @@ -214,7 +262,6 @@ endif ( ) configure_file ( "Config/GraphBLAS.h.in" "${PROJECT_SOURCE_DIR}/Include/GraphBLAS.h" NEWLINE_STYLE LF ) - configure_file ( "Config/GraphBLAS_version.tex.in" "${PROJECT_SOURCE_DIR}/Doc/GraphBLAS_version.tex" NEWLINE_STYLE LF ) @@ -226,7 +273,7 @@ configure_file ( 
"Config/README.md.in" NEWLINE_STYLE LF ) #------------------------------------------------------------------------------- -# include directories for both graphblas and the demos +# include directories for both GraphBLAS and the demos #------------------------------------------------------------------------------- include_directories ( ${PROJECT_SOURCE_DIR} Source Include Config @@ -266,6 +313,20 @@ include_directories ( ${PROJECT_SOURCE_DIR} Source Include Config Source/type Source/wait Source/werk + # include all CUDA/* folders that have include/ or template/ subfolders: + CUDA + CUDA/apply + CUDA/cumsum + CUDA/device + CUDA/include + CUDA/init + CUDA/matrix + CUDA/monoid + CUDA/mxm + CUDA/reduce + CUDA/select + CUDA/slice + CUDA/type ) #------------------------------------------------------------------------------- @@ -275,10 +336,17 @@ include_directories ( ${PROJECT_SOURCE_DIR} Source Include Config include ( GraphBLAS_compiler_options ) #------------------------------------------------------------------------------- -# dynamic graphblas library properties +# GraphBLAS source code #------------------------------------------------------------------------------- -file ( GLOB GRAPHBLAS_SOURCES "PreJIT/*.c" "Config/*.c" "Source/*/*.c" ) +if ( GRAPHBLAS_HAS_CUDA ) + file ( GLOB GRAPHBLAS_SOURCES + "rmm_wrap/rmm_wrap.cpp" + "CUDA/*/*.cu" "CUDA/*/*.c" "CUDA/*/*.cpp" + "PreJIT/*.c" "Config/*.c" "Source/*/*.c" ) +else ( ) + file ( GLOB GRAPHBLAS_SOURCES "PreJIT/*.c" "Config/*.c" "Source/*/*.c" ) +endif ( ) if ( NOT GRAPHBLAS_COMPACT ) # compile the FactoryKernels @@ -286,6 +354,10 @@ if ( NOT GRAPHBLAS_COMPACT ) list ( APPEND GRAPHBLAS_SOURCES ${GRAPHBLAS_FACTORYKERNELS} ) endif ( ) +#------------------------------------------------------------------------------- +# GraphBLAS JITpackage +#------------------------------------------------------------------------------- + if ( GRAPHBLAS_USE_JIT ) # generate compressed JIT sources to create GB_JITpackage.c message ( STATUS 
"Creating the GraphBLAS/JITpackage:" ) @@ -303,7 +375,13 @@ else ( ) list ( PREPEND GRAPHBLAS_SOURCES "JITpackage/GB_JITpackage.c") endif ( ) +#------------------------------------------------------------------------------- +# dynamic GraphBLAS library properties +#------------------------------------------------------------------------------- + if ( BUILD_SHARED_LIBS ) + + # create the dynamic GraphBLAS library and set its properties add_library ( GraphBLAS SHARED ${GRAPHBLAS_SOURCES} ) set_target_properties ( GraphBLAS PROPERTIES @@ -315,6 +393,13 @@ if ( BUILD_SHARED_LIBS ) PUBLIC_HEADER "Include/GraphBLAS.h" WINDOWS_EXPORT_ALL_SYMBOLS ON ) + if ( GRAPHBLAS_HAS_CUDA ) + set_target_properties ( GraphBLAS PROPERTIES + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON ) + set_target_properties ( GraphBLAS PROPERTIES CUDA_SEPARABLE_COMPILATION ON ) + endif ( ) + if ( ${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.25" ) set_target_properties ( GraphBLAS PROPERTIES EXPORT_NO_SYSTEM ON ) endif ( ) @@ -323,14 +408,15 @@ if ( BUILD_SHARED_LIBS ) INTERFACE $ $ ) - if ( SUITESPARSE_HAS_CUDA AND GRAPHBLAS_USE_CUDA ) - add_dependencies ( GraphBLAS GraphBLAS_CUDA ) - add_dependencies ( GraphBLAS RMM_wrap ) + if ( GRAPHBLAS_HAS_CUDA ) target_compile_definitions ( GraphBLAS PRIVATE "GRAPHBLAS_HAS_CUDA" ) + target_include_directories ( GraphBLAS PRIVATE "${RMM_WRAP_INCLUDES}" ) +# required for rmm 25.10: +# target_compile_definitions ( GraphBLAS PRIVATE "LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE" ) endif ( ) if ( WIN32 ) - # Compiling the graphblas dll on Windows: export the dll symbols + # Compiling the GraphBLAS dll on Windows: export the dll symbols target_compile_definitions ( GraphBLAS PRIVATE GB_DLL_EXPORT ) endif ( ) @@ -344,11 +430,12 @@ if ( BUILD_SHARED_LIBS ) endif ( ) #------------------------------------------------------------------------------- -# static graphblas library properties +# static GraphBLAS library properties 
#------------------------------------------------------------------------------- if ( BUILD_STATIC_LIBS ) + # create the static GraphBLAS library and set its properties add_library ( GraphBLAS_static STATIC ${GRAPHBLAS_SOURCES} ) set_target_properties ( GraphBLAS_static PROPERTIES @@ -357,6 +444,13 @@ if ( BUILD_STATIC_LIBS ) C_STANDARD_REQUIRED ON PUBLIC_HEADER "Include/GraphBLAS.h" ) + if ( GRAPHBLAS_HAS_CUDA ) + set_target_properties ( GraphBLAS_static PROPERTIES + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON ) + set_target_properties ( GraphBLAS_static PROPERTIES CUDA_SEPARABLE_COMPILATION ON ) + endif ( ) + if ( MSVC OR ("${CMAKE_C_SIMULATE_ID}" STREQUAL "MSVC") ) set_target_properties ( GraphBLAS_static PROPERTIES OUTPUT_NAME graphblas_static ) @@ -370,11 +464,11 @@ if ( BUILD_STATIC_LIBS ) INTERFACE $ $ ) - if ( SUITESPARSE_HAS_CUDA AND GRAPHBLAS_USE_CUDA ) - add_dependencies ( GraphBLAS_static GraphBLAS_CUDA ) - set ( GRAPHBLAS_STATIC_MODULES "${GRAPHBLAS_STATIC_MODULES} GraphBLAS_CUDA" ) - add_dependencies ( GraphBLAS_static RMM_wrap ) + if ( GRAPHBLAS_HAS_CUDA ) target_compile_definitions ( GraphBLAS_static PRIVATE "GRAPHBLAS_HAS_CUDA" ) +# required for rmm 25.10: +# target_compile_definitions ( GraphBLAS_static PRIVATE "LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE" ) + target_include_directories ( GraphBLAS_static PRIVATE "${RMM_WRAP_INCLUDES}" ) endif ( ) if ( WIN32 ) @@ -428,7 +522,7 @@ if ( NOT GBNCPUFEAT ) endif ( ) #------------------------------------------------------------------------------- -# select the math library (not required for Microsoft Visual Studio) +# determine which basic libraries are needed (m, dl, and atomic) #------------------------------------------------------------------------------- # libm: @@ -468,10 +562,9 @@ if ( LIBATOMIC_REQUIRED ) endif ( ) #------------------------------------------------------------------------------- -# add the OpenMP, IPP, CUDA, BLAS, etc libraries +# get the current library list, before linking 
with OpenMP, CUDA and rmm #------------------------------------------------------------------------------- -# get the current library list, before linking with OpenMP, CUDA and rmm if ( BUILD_SHARED_LIBS ) get_target_property ( GB_CMAKE_LIBRARIES GraphBLAS LINK_LIBRARIES ) else ( ) @@ -481,16 +574,26 @@ if ( NOT GB_CMAKE_LIBRARIES ) set ( GB_CMAKE_LIBRARIES "" ) endif ( ) +#------------------------------------------------------------------------------- +# add the OpenMP, IPP, CUDA, BLAS, etc libraries +#------------------------------------------------------------------------------- + if ( GRAPHBLAS_HAS_OPENMP ) message ( STATUS "CMAKE OpenMP libraries: ${OpenMP_C_LIBRARIES}" ) message ( STATUS "CMAKE OpenMP include: ${OpenMP_C_INCLUDE_DIRS}" ) list ( APPEND GB_CMAKE_LIBRARIES ${OpenMP_C_LIBRARIES} ) if ( BUILD_SHARED_LIBS ) target_link_libraries ( GraphBLAS PRIVATE OpenMP::OpenMP_C ) + if ( GRAPHBLAS_HAS_CUDA ) + target_link_libraries ( GraphBLAS PRIVATE OpenMP::OpenMP_CXX ) + endif ( ) endif ( ) if ( BUILD_STATIC_LIBS ) list ( APPEND GRAPHBLAS_STATIC_LIBS ${OpenMP_C_LIBRARIES} ) target_link_libraries ( GraphBLAS_static PRIVATE OpenMP::OpenMP_C ) + if ( GRAPHBLAS_HAS_CUDA ) + target_link_libraries ( GraphBLAS_static PRIVATE OpenMP::OpenMP_CXX ) + endif ( ) endif ( ) message ( STATUS "CMAKE OpenMP C flags: ${OpenMP_C_FLAGS}" ) set ( GB_OPENMP_C_FLAGS "${OpenMP_C_FLAGS}" ) @@ -517,12 +620,32 @@ else ( ) set ( GB_OPENMP_C_FLAGS "" ) endif ( ) -if ( SUITESPARSE_HAS_CUDA AND GRAPHBLAS_USE_CUDA ) +#------------------------------------------------------------------------------- +# link CUDA libraries +#------------------------------------------------------------------------------- + +if ( GRAPHBLAS_HAS_CUDA ) if ( BUILD_SHARED_LIBS ) - target_link_libraries ( GraphBLAS PRIVATE ${GB_CUDA} ${GB_RMM} ) + target_link_libraries ( GraphBLAS PRIVATE ${GB_CUDA} ) + target_link_libraries ( GraphBLAS PRIVATE CUDA::nvrtc CUDA::cudart_static CUDA::cuda_driver ) + if ( TARGET 
CUDA::nvToolsExt ) + target_link_libraries ( GraphBLAS PRIVATE CUDA::nvToolsExt ) + endif ( ) + if ( TARGET CUDA::nvtx3 ) + target_link_libraries ( GraphBLAS PRIVATE CUDA::nvtx3 ) + target_compile_definitions ( GraphBLAS PRIVATE GBNVTX ) + endif ( ) endif ( ) if ( BUILD_STATIC_LIBS ) - target_link_libraries ( GraphBLAS_static PUBLIC ${GB_CUDA} ${GB_RMM} ) + target_link_libraries ( GraphBLAS_static PUBLIC ${GB_CUDA} ) + target_link_libraries ( GraphBLAS_static PRIVATE CUDA::nvrtc CUDA::cudart_static CUDA::cuda_driver ) + if ( TARGET CUDA::nvToolsExt ) + target_link_libraries ( GraphBLAS_static PRIVATE CUDA::nvToolsExt ) + endif ( ) + if ( TARGET CUDA::nvtx3 ) + target_link_libraries ( GraphBLAS_static PRIVATE CUDA::nvtx3 ) + target_compile_definitions ( GraphBLAS_static PRIVATE GBNVTX ) + endif ( ) endif ( ) endif ( ) @@ -571,18 +694,23 @@ if ( SUITESPARSE_DEMOS ) target_link_libraries ( grow_demo PUBLIC GraphBLAS_static ) endif ( ) - target_link_libraries ( complex_demo PUBLIC ${GB_M} ${GB_CUDA} ${GB_RMM} ) - target_link_libraries ( simple_demo PUBLIC ${GB_M} ${GB_CUDA} ${GB_RMM} ) - target_link_libraries ( wildtype_demo PUBLIC ${GB_M} ${GB_CUDA} ${GB_RMM} ) - target_link_libraries ( wathen_demo PUBLIC ${GB_M} ${GB_CUDA} ${GB_RMM} ) - target_link_libraries ( context_demo PUBLIC ${GB_M} ${GB_CUDA} ${GB_RMM} ) - target_link_libraries ( gauss_demo PUBLIC ${GB_M} ${GB_CUDA} ${GB_RMM} ) - target_link_libraries ( grow_demo PUBLIC ${GB_M} ${GB_CUDA} ${GB_RMM} ) + target_link_libraries ( complex_demo PUBLIC ${GB_M} ${GB_CUDA} ) + target_link_libraries ( simple_demo PUBLIC ${GB_M} ${GB_CUDA} ) + target_link_libraries ( wildtype_demo PUBLIC ${GB_M} ${GB_CUDA} ) + target_link_libraries ( wathen_demo PUBLIC ${GB_M} ${GB_CUDA} ) + target_link_libraries ( context_demo PUBLIC ${GB_M} ${GB_CUDA} ) + target_link_libraries ( gauss_demo PUBLIC ${GB_M} ${GB_CUDA} ) + target_link_libraries ( grow_demo PUBLIC ${GB_M} ${GB_CUDA} ) if ( GRAPHBLAS_HAS_OPENMP ) target_link_libraries ( 
wathen_demo PUBLIC OpenMP::OpenMP_C ) target_link_libraries ( context_demo PUBLIC OpenMP::OpenMP_C ) target_link_libraries ( grow_demo PUBLIC OpenMP::OpenMP_C ) + if ( GRAPHBLAS_HAS_CUDA ) + target_link_libraries ( wathen_demo PUBLIC OpenMP::OpenMP_CXX ) + target_link_libraries ( context_demo PUBLIC OpenMP::OpenMP_CXX ) + target_link_libraries ( grow_demo PUBLIC OpenMP::OpenMP_CXX ) + endif ( ) endif ( ) else ( ) @@ -722,7 +850,7 @@ if ( NOT MSVC ) endif ( ) #------------------------------------------------------------------------------- -# configure the JITs +# configure the JIT #------------------------------------------------------------------------------- include ( GraphBLAS_JIT_configure ) diff --git a/GraphBLAS/CUDA/CMakeLists.txt b/GraphBLAS/CUDA/CMakeLists.txt deleted file mode 100644 index 6fd80479e5..0000000000 --- a/GraphBLAS/CUDA/CMakeLists.txt +++ /dev/null @@ -1,158 +0,0 @@ -#------------------------------------------------------------------------------- -# GraphBLAS/CUDA/CMakeLists.txt: cmake script for GraphBLAS/CUDA -#------------------------------------------------------------------------------- - -# SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2025, All Rights Reserved. - -# Some files in this folder are (c) NVIDIA or (c) Google. Please refer -# to their individual licenses (Apache, BSD, or others). 
-# SPDX-License-Identifier: Apache-2.0 - -#------------------------------------------------------------------------------- - -cmake_minimum_required ( VERSION 3.20 ) # GraphBLAS can be built stand-alone - -project ( GRAPHBLAS_CUDA - VERSION "${GraphBLAS_VERSION_MAJOR}.${GraphBLAS_VERSION_MINOR}.${GraphBLAS_VERSION_SUB}" - LANGUAGES CXX CUDA ) - -if ( CMAKE_VERSION VERSION_GREATER_EQUAL 3.24 ) - # requires cmake 3.24: - cmake_policy ( SET CMP0135 NEW ) # URL download timestamp policy -endif ( ) - -set ( CMAKE_CXX_STANDARD 17 ) - -set ( CMAKE_CUDA_FLAGS "-cudart=static -lineinfo " ) -set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --std=c++17 -fPIC " ) - -add_compile_definitions ( GBNCPUFEAT ) - -message ( STATUS "C++ flags for CUDA: ${CMAKE_CXX_FLAGS}" ) - -file ( GLOB GRAPHBLAS_CUDA_SOURCES "*.cu" "*.c" "*.cpp" ) - -add_library ( GraphBLAS_CUDA SHARED ${GRAPHBLAS_CUDA_SOURCES} ) - -set_target_properties ( GraphBLAS_CUDA PROPERTIES - VERSION ${GraphBLAS_VERSION_MAJOR}.${GraphBLAS_VERSION_MINOR}.${GraphBLAS_VERSION_SUB} - OUTPUT_NAME graphblascuda - SOVERSION ${GraphBLAS_VERSION_MAJOR} - C_STANDARD 11 - C_STANDARD_REQUIRED ON ) - -#------------------------------------------------------------------------------- -# GraphBLAS_CUDA properties -#------------------------------------------------------------------------------- - -target_include_directories ( GraphBLAS_CUDA PRIVATE - ${PROJECT_SOURCE_DIR} - ${PROJECT_SOURCE_DIR}/../ - include - ../Include - ../rmm_wrap - ../Source - ../Source/hyper - ../Source/builtin - ) - -set_target_properties ( GraphBLAS_CUDA PROPERTIES POSITION_INDEPENDENT_CODE ON ) -set_target_properties ( GraphBLAS_CUDA PROPERTIES CUDA_SEPARABLE_COMPILATION ON ) - -target_link_libraries ( GraphBLAS_CUDA PRIVATE CUDA::nvrtc CUDA::cudart_static CUDA::cuda_driver ) - -if ( TARGET CUDA::nvToolsExt ) - target_link_libraries ( GraphBLAS_CUDA PRIVATE CUDA::nvToolsExt ) -endif ( ) - -if ( TARGET CUDA::nvtx3 ) - target_link_libraries ( GraphBLAS_CUDA PRIVATE 
CUDA::nvtx3 ) - target_compile_definitions ( GraphBLAS_CUDA PRIVATE GBNVTX ) -endif ( ) - -target_compile_definitions ( GraphBLAS_CUDA PUBLIC "GRAPHBLAS_HAS_CUDA" ) - -if ( OpenMP_CXX_FOUND ) - target_include_directories ( GraphBLAS_CUDA PRIVATE OpenMP::OpenMP_CXX ) -endif ( ) - -target_include_directories ( GraphBLAS_CUDA - INTERFACE $ - $ ) - -#------------------------------------------------------------------------------- -# installation location -#------------------------------------------------------------------------------- - -include ( CMakePackageConfigHelpers ) - -install ( TARGETS GraphBLAS_CUDA - EXPORT GraphBLAS_CUDATargets - LIBRARY DESTINATION ${SUITESPARSE_LIBDIR} - ARCHIVE DESTINATION ${SUITESPARSE_LIBDIR} - RUNTIME DESTINATION ${SUITESPARSE_BINDIR} - PUBLIC_HEADER DESTINATION ${SUITESPARSE_INCLUDEDIR} ) - -# create (temporary) export target file during build -export ( EXPORT GraphBLAS_CUDATargets - NAMESPACE SuiteSparse:: - FILE ${CMAKE_CURRENT_BINARY_DIR}/GraphBLAS_CUDATargets.cmake ) - -# install export target and config for find_package -install ( EXPORT GraphBLAS_CUDATargets - NAMESPACE SuiteSparse:: - DESTINATION ${SUITESPARSE_PKGFILEDIR}/cmake/GraphBLAS ) - -configure_package_config_file ( - Config/GraphBLAS_CUDAConfig.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/GraphBLAS_CUDAConfig.cmake - INSTALL_DESTINATION ${SUITESPARSE_PKGFILEDIR}/cmake/GraphBLAS ) - -write_basic_package_version_file ( - ${CMAKE_CURRENT_BINARY_DIR}/GraphBLAS_CUDAConfigVersion.cmake - COMPATIBILITY SameMajorVersion ) - -install ( FILES - ${CMAKE_CURRENT_BINARY_DIR}/GraphBLAS_CUDAConfig.cmake - ${CMAKE_CURRENT_BINARY_DIR}/GraphBLAS_CUDAConfigVersion.cmake - DESTINATION ${SUITESPARSE_PKGFILEDIR}/cmake/GraphBLAS ) - -#------------------------------------------------------------------------------- -# create pkg-config file -#------------------------------------------------------------------------------- - -if ( NOT MSVC ) - set ( prefix "${CMAKE_INSTALL_PREFIX}" ) - set ( 
exec_prefix "\${prefix}" ) - cmake_path ( IS_ABSOLUTE SUITESPARSE_LIBDIR SUITESPARSE_LIBDIR_IS_ABSOLUTE ) - if (SUITESPARSE_LIBDIR_IS_ABSOLUTE) - set ( libdir "${SUITESPARSE_LIBDIR}") - else ( ) - set ( libdir "\${exec_prefix}/${SUITESPARSE_LIBDIR}") - endif ( ) - cmake_path ( IS_ABSOLUTE SUITESPARSE_INCLUDEDIR SUITESPARSE_INCLUDEDIR_IS_ABSOLUTE ) - if (SUITESPARSE_INCLUDEDIR_IS_ABSOLUTE) - set ( includedir "${SUITESPARSE_INCLUDEDIR}") - else ( ) - set ( includedir "\${prefix}/${SUITESPARSE_INCLUDEDIR}") - endif ( ) - configure_file ( - Config/GraphBLAS_CUDA.pc.in - GraphBLAS_CUDA.pc - @ONLY - NEWLINE_STYLE LF ) - install ( FILES - ${CMAKE_CURRENT_BINARY_DIR}/GraphBLAS_CUDA.pc - DESTINATION ${SUITESPARSE_PKGFILEDIR}/pkgconfig ) -endif ( ) - -#------------------------------------------------------------------------------- -# report -#------------------------------------------------------------------------------- - -message ( STATUS "CMAKE_CUDA_COMPILER: ${CMAKE_CUDA_COMPILER} ") -message ( STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS} ") -message ( STATUS "CMAKE_CUDA_FLAGS_RELEASE: ${CMAKE_CUDA_FLAGS_RELEASE} ") -message ( STATUS "CMAKE_CUDA_FLAGS_DEBUG: ${CMAKE_CUDA_FLAGS_DEBUG} ") - - diff --git a/GraphBLAS/CUDA/Config/GraphBLAS_CUDA.pc.in b/GraphBLAS/CUDA/Config/GraphBLAS_CUDA.pc.in deleted file mode 100644 index a04257bf97..0000000000 --- a/GraphBLAS/CUDA/Config/GraphBLAS_CUDA.pc.in +++ /dev/null @@ -1,17 +0,0 @@ -# GraphBLAS_CUDA, Copyright (c) 2017-2025, FIXME -# All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ - -# FIXME: Which flags do we need to statically link CUDA? 
- -Name: GraphBLAS_CUDA -URL: https://github.com/DrTimothyAldenDavis/SuiteSparse -Description: CUDA support library for GraphBLAS in SuiteSparse -Version: @GraphBLAS_VERSION_MAJOR@.@GraphBLAS_VERSION_MINOR@.@GraphBLAS_VERSION_SUB@ -Libs: -L${libdir} -lgraphblascuda@CMAKE_RELEASE_POSTFIX@ -Cflags: -I${includedir} -DGRAPHBLAS_HAS_CUDA diff --git a/GraphBLAS/CUDA/Config/GraphBLAS_CUDAConfig.cmake.in b/GraphBLAS/CUDA/Config/GraphBLAS_CUDAConfig.cmake.in deleted file mode 100644 index b5249120c2..0000000000 --- a/GraphBLAS/CUDA/Config/GraphBLAS_CUDAConfig.cmake.in +++ /dev/null @@ -1,139 +0,0 @@ -#------------------------------------------------------------------------------- -# SuiteSparse/GraphBLAS/cmake_modules/GraphBLASConfig.cmake -#------------------------------------------------------------------------------- - -# The following copyright and license applies to just this file only, not to -# the library itself: -# GraphBLASConfig.cmake, Copyright (c) 2023-2025, FIXME -# SPDX-License-Identifier: BSD-3-clause - -#------------------------------------------------------------------------------- - -# Finds the GraphBLAS_CUDA include file and compiled library. -# The following targets are defined: -# SuiteSparse::GRAPHBLAS_CUDA - for the shared library (if available) -# SuiteSparse::GRAPHBLAS_CUDA_static - for the static library (if available) - -# For backward compatibility the following variables are set: - -# GRAPHBLAS_CUDA_INCLUDE_DIR - where to find GraphBLAS.h, etc. -# GRAPHBLAS_CUDA_LIBRARY - dynamic GraphBLAS library -# GRAPHBLAS_CUDA_STATIC - static GraphBLAS library -# GRAPHBLAS_CUDA_LIBRARIES - libraries when using GraphBLAS -# GRAPHBLAS_CUDA_FOUND - true if GraphBLAS found - -# Set ``CMAKE_MODULE_PATH`` to the parent folder where this module file is -# installed. 
- -#------------------------------------------------------------------------------- - -@PACKAGE_INIT@ - -set ( GRAPHBLAS_CUDA_DATE "@GraphBLAS_DATE@" ) -set ( GRAPHBLAS_CUDA_VERSION_MAJOR @GraphBLAS_VERSION_MAJOR@ ) -set ( GRAPHBLAS_CUDA_VERSION_MINOR @GraphBLAS_VERSION_MINOR@ ) -set ( GRAPHBLAS_CUDA_VERSION_PATCH @GraphBLAS_VERSION_SUB@ ) -set ( GRAPHBLAS_CUDA_VERSION "@GraphBLAS_VERSION_MAJOR@.@GraphBLAS_VERSION_MINOR@.@GraphBLAS_VERSION_SUB@" ) - -# Check for dependent targets -include ( CMakeFindDependencyMacro ) -set ( _dependencies_found ON ) - -# Look for NVIDIA CUDA toolkit -if ( NOT CUDAToolkit_FOUND ) - find_dependency ( CUDAToolkit @CUDAToolkit_VERSION_MAJOR@ ) - if ( NOT CUDAToolkit_FOUND ) - set ( _dependencies_found OFF ) - endif ( ) -endif ( ) - -if ( NOT _dependencies_found ) - set ( GraphBLAS_CUDA_FOUND OFF ) - return ( ) -endif ( ) - -# Import target -include ( ${CMAKE_CURRENT_LIST_DIR}/GraphBLAS_CUDATargets.cmake ) - -# The following is only for backward compatibility with FindGraphBLAS_CUDA. - -set ( _target_shared SuiteSparse::GraphBLAS_CUDA ) -set ( _target_static SuiteSparse::GraphBLAS_CUDA_static ) -set ( _var_prefix "GRAPHBLAS_CUDA" ) - -get_target_property ( ${_var_prefix}_INCLUDE_DIR ${_target_shared} INTERFACE_INCLUDE_DIRECTORIES ) -if ( ${_var_prefix}_INCLUDE_DIR ) - # First item in SuiteSparse targets contains the "main" header directory. 
- list ( GET ${_var_prefix}_INCLUDE_DIR 0 ${_var_prefix}_INCLUDE_DIR ) -endif ( ) -get_target_property ( ${_var_prefix}_LIBRARY ${_target_shared} IMPORTED_IMPLIB ) -if ( NOT ${_var_prefix}_LIBRARY ) - get_target_property ( _library_chk ${_target_shared} IMPORTED_LOCATION ) - if ( EXISTS ${_library_chk} ) - set ( ${_var_prefix}_LIBRARY ${_library_chk} ) - endif ( ) -endif ( ) -if ( TARGET ${_target_static} ) - get_target_property ( ${_var_prefix}_STATIC ${_target_static} IMPORTED_LOCATION ) -endif ( ) - -# Check for most common build types -set ( _config_types "Debug" "Release" "RelWithDebInfo" "MinSizeRel" "None" ) - -get_property ( _isMultiConfig GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG ) -if ( _isMultiConfig ) - # For multi-configuration generators (e.g., Visual Studio), prefer those - # configurations. - list ( PREPEND _config_types ${CMAKE_CONFIGURATION_TYPES} ) -else ( ) - # For single-configuration generators, prefer the current configuration. - list ( PREPEND _config_types ${CMAKE_BUILD_TYPE} ) -endif ( ) - -list ( REMOVE_DUPLICATES _config_types ) - -foreach ( _config ${_config_types} ) - string ( TOUPPER ${_config} _uc_config ) - if ( NOT ${_var_prefix}_LIBRARY ) - get_target_property ( _library_chk ${_target_shared} - IMPORTED_IMPLIB_${_uc_config} ) - if ( EXISTS ${_library_chk} ) - set ( ${_var_prefix}_LIBRARY ${_library_chk} ) - endif ( ) - endif ( ) - if ( NOT ${_var_prefix}_LIBRARY ) - get_target_property ( _library_chk ${_target_shared} - IMPORTED_LOCATION_${_uc_config} ) - if ( EXISTS ${_library_chk} ) - set ( ${_var_prefix}_LIBRARY ${_library_chk} ) - endif ( ) - endif ( ) - if ( TARGET ${_target_static} AND NOT ${_var_prefix}_STATIC ) - get_target_property ( _library_chk ${_target_static} - IMPORTED_LOCATION_${_uc_config} ) - if ( EXISTS ${_library_chk} ) - set ( ${_var_prefix}_STATIC ${_library_chk} ) - endif ( ) - endif ( ) -endforeach ( ) - -set ( GRAPHBLAS_CUDA_LIBRARIES ${GRAPHBLAS_CUDA_LIBRARY} ) - -macro ( suitesparse_check_exist _var 
_files ) - # ignore generator expressions - string ( GENEX_STRIP "${_files}" _files2 ) - - foreach ( _file ${_files2} ) - if ( NOT EXISTS "${_file}" ) - message ( FATAL_ERROR "File or directory ${_file} referenced by variable ${_var} does not exist!" ) - endif ( ) - endforeach () -endmacro ( ) - -suitesparse_check_exist ( GRAPHBLAS_CUDA_INCLUDE_DIR ${GRAPHBLAS_CUDA_INCLUDE_DIR} ) -suitesparse_check_exist ( GRAPHBLAS_CUDA_LIBRARY ${GRAPHBLAS_CUDA_LIBRARY} ) - -message ( STATUS "GraphBLAS_CUDA version: ${GRAPHBLAS_CUDA_VERSION}" ) -message ( STATUS "GraphBLAS_CUDA include: ${GRAPHBLAS_CUDA_INCLUDE_DIR}" ) -message ( STATUS "GraphBLAS_CUDA library: ${GRAPHBLAS_CUDA_LIBRARY}" ) -message ( STATUS "GraphBLAS_CUDA static: ${GRAPHBLAS_CUDA_STATIC}" ) diff --git a/GraphBLAS/CUDA/GB_cuda_apply.hpp b/GraphBLAS/CUDA/apply/GB_cuda_apply.hpp similarity index 100% rename from GraphBLAS/CUDA/GB_cuda_apply.hpp rename to GraphBLAS/CUDA/apply/GB_cuda_apply.hpp diff --git a/GraphBLAS/CUDA/GB_cuda_apply_bind1st_jit.cpp b/GraphBLAS/CUDA/apply/GB_cuda_apply_bind1st_jit.cpp similarity index 98% rename from GraphBLAS/CUDA/GB_cuda_apply_bind1st_jit.cpp rename to GraphBLAS/CUDA/apply/GB_cuda_apply_bind1st_jit.cpp index ba3a03da24..9427bbc8b0 100644 --- a/GraphBLAS/CUDA/GB_cuda_apply_bind1st_jit.cpp +++ b/GraphBLAS/CUDA/apply/GB_cuda_apply_bind1st_jit.cpp @@ -1,4 +1,4 @@ -#include "GB_cuda_apply.hpp" +#include "apply/GB_cuda_apply.hpp" extern "C" { diff --git a/GraphBLAS/CUDA/GB_cuda_apply_bind2nd_jit.cpp b/GraphBLAS/CUDA/apply/GB_cuda_apply_bind2nd_jit.cpp similarity index 98% rename from GraphBLAS/CUDA/GB_cuda_apply_bind2nd_jit.cpp rename to GraphBLAS/CUDA/apply/GB_cuda_apply_bind2nd_jit.cpp index d96b4772ad..4d8d648dc3 100644 --- a/GraphBLAS/CUDA/GB_cuda_apply_bind2nd_jit.cpp +++ b/GraphBLAS/CUDA/apply/GB_cuda_apply_bind2nd_jit.cpp @@ -1,4 +1,4 @@ -#include "GB_cuda_apply.hpp" +#include "apply/GB_cuda_apply.hpp" extern "C" { diff --git a/GraphBLAS/CUDA/GB_cuda_apply_binop.cpp 
b/GraphBLAS/CUDA/apply/GB_cuda_apply_binop.cpp similarity index 95% rename from GraphBLAS/CUDA/GB_cuda_apply_binop.cpp rename to GraphBLAS/CUDA/apply/GB_cuda_apply_binop.cpp index 8c31a0367c..fde8886166 100644 --- a/GraphBLAS/CUDA/GB_cuda_apply_binop.cpp +++ b/GraphBLAS/CUDA/apply/GB_cuda_apply_binop.cpp @@ -1,4 +1,4 @@ -#include "GB_cuda_apply.hpp" +#include "apply/GB_cuda_apply.hpp" #undef GB_FREE_WORKSPACE #define GB_FREE_WORKSPACE \ @@ -10,7 +10,7 @@ #define GB_FREE_ALL \ { \ GB_FREE_WORKSPACE ; \ - GB_cuda_stream_pool_release (&stream) ; \ + GB_cuda_stream_pool_release (&stream) ; \ } #define BLOCK_SIZE 512 diff --git a/GraphBLAS/CUDA/GB_cuda_apply_binop_branch.cpp b/GraphBLAS/CUDA/apply/GB_cuda_apply_binop_branch.cpp similarity index 100% rename from GraphBLAS/CUDA/GB_cuda_apply_binop_branch.cpp rename to GraphBLAS/CUDA/apply/GB_cuda_apply_binop_branch.cpp diff --git a/GraphBLAS/CUDA/GB_cuda_apply_unop.cpp b/GraphBLAS/CUDA/apply/GB_cuda_apply_unop.cpp similarity index 95% rename from GraphBLAS/CUDA/GB_cuda_apply_unop.cpp rename to GraphBLAS/CUDA/apply/GB_cuda_apply_unop.cpp index cf0ef9309f..34a48544a9 100644 --- a/GraphBLAS/CUDA/GB_cuda_apply_unop.cpp +++ b/GraphBLAS/CUDA/apply/GB_cuda_apply_unop.cpp @@ -1,4 +1,4 @@ -#include "GB_cuda_apply.hpp" +#include "apply/GB_cuda_apply.hpp" #undef GB_FREE_WORKSPACE #define GB_FREE_WORKSPACE \ @@ -10,7 +10,7 @@ #define GB_FREE_ALL \ { \ GB_FREE_WORKSPACE \ - GB_cuda_stream_pool_release (&stream) ; \ + GB_cuda_stream_pool_release (&stream) ; \ } #define BLOCK_SIZE 512 diff --git a/GraphBLAS/CUDA/GB_cuda_apply_unop_branch.cpp b/GraphBLAS/CUDA/apply/GB_cuda_apply_unop_branch.cpp similarity index 100% rename from GraphBLAS/CUDA/GB_cuda_apply_unop_branch.cpp rename to GraphBLAS/CUDA/apply/GB_cuda_apply_unop_branch.cpp diff --git a/GraphBLAS/CUDA/GB_cuda_apply_unop_jit.cpp b/GraphBLAS/CUDA/apply/GB_cuda_apply_unop_jit.cpp similarity index 98% rename from GraphBLAS/CUDA/GB_cuda_apply_unop_jit.cpp rename to 
GraphBLAS/CUDA/apply/GB_cuda_apply_unop_jit.cpp index fccbf6746f..554bdc5a10 100644 --- a/GraphBLAS/CUDA/GB_cuda_apply_unop_jit.cpp +++ b/GraphBLAS/CUDA/apply/GB_cuda_apply_unop_jit.cpp @@ -1,4 +1,4 @@ -#include "GB_cuda_apply.hpp" +#include "apply/GB_cuda_apply.hpp" extern "C" { diff --git a/GraphBLAS/CUDA/template/GB_jit_kernel_cuda_apply_bind1st.cu b/GraphBLAS/CUDA/apply/template/GB_jit_kernel_cuda_apply_bind1st.cu similarity index 100% rename from GraphBLAS/CUDA/template/GB_jit_kernel_cuda_apply_bind1st.cu rename to GraphBLAS/CUDA/apply/template/GB_jit_kernel_cuda_apply_bind1st.cu diff --git a/GraphBLAS/CUDA/template/GB_jit_kernel_cuda_apply_bind2nd.cu b/GraphBLAS/CUDA/apply/template/GB_jit_kernel_cuda_apply_bind2nd.cu similarity index 100% rename from GraphBLAS/CUDA/template/GB_jit_kernel_cuda_apply_bind2nd.cu rename to GraphBLAS/CUDA/apply/template/GB_jit_kernel_cuda_apply_bind2nd.cu diff --git a/GraphBLAS/CUDA/template/GB_jit_kernel_cuda_apply_unop.cu b/GraphBLAS/CUDA/apply/template/GB_jit_kernel_cuda_apply_unop.cu similarity index 98% rename from GraphBLAS/CUDA/template/GB_jit_kernel_cuda_apply_unop.cu rename to GraphBLAS/CUDA/apply/template/GB_jit_kernel_cuda_apply_unop.cu index cfc752c8b7..9b9a2473bb 100644 --- a/GraphBLAS/CUDA/template/GB_jit_kernel_cuda_apply_unop.cu +++ b/GraphBLAS/CUDA/apply/template/GB_jit_kernel_cuda_apply_unop.cu @@ -2,7 +2,7 @@ using namespace cooperative_groups ; -#include "GB_cuda_ek_slice.cuh" +#include "template/GB_cuda_ek_slice.cuh" #define log2_chunk_size 10 #define chunk_size 1024 diff --git a/GraphBLAS/CUDA/template/GB_cuda_cumsum.cuh b/GraphBLAS/CUDA/cumsum/template/GB_cuda_cumsum.cuh similarity index 100% rename from GraphBLAS/CUDA/template/GB_cuda_cumsum.cuh rename to GraphBLAS/CUDA/cumsum/template/GB_cuda_cumsum.cuh diff --git a/GraphBLAS/CUDA/template/GB_cuda_threadblock_sum_uint64.cuh b/GraphBLAS/CUDA/cumsum/template/GB_cuda_threadblock_sum_uint64.cuh similarity index 100% rename from 
GraphBLAS/CUDA/template/GB_cuda_threadblock_sum_uint64.cuh rename to GraphBLAS/CUDA/cumsum/template/GB_cuda_threadblock_sum_uint64.cuh diff --git a/GraphBLAS/CUDA/template/GB_cuda_tile_sum_uint64.cuh b/GraphBLAS/CUDA/cumsum/template/GB_cuda_tile_sum_uint64.cuh similarity index 100% rename from GraphBLAS/CUDA/template/GB_cuda_tile_sum_uint64.cuh rename to GraphBLAS/CUDA/cumsum/template/GB_cuda_tile_sum_uint64.cuh diff --git a/GraphBLAS/CUDA/GB_cuda_get_device_count.cu b/GraphBLAS/CUDA/device/GB_cuda_get_device_count.cu similarity index 100% rename from GraphBLAS/CUDA/GB_cuda_get_device_count.cu rename to GraphBLAS/CUDA/device/GB_cuda_get_device_count.cu diff --git a/GraphBLAS/CUDA/GB_cuda_get_device_properties.cu b/GraphBLAS/CUDA/device/GB_cuda_get_device_properties.cu similarity index 99% rename from GraphBLAS/CUDA/GB_cuda_get_device_properties.cu rename to GraphBLAS/CUDA/device/GB_cuda_get_device_properties.cu index b61ac74848..ce14674cff 100644 --- a/GraphBLAS/CUDA/GB_cuda_get_device_properties.cu +++ b/GraphBLAS/CUDA/device/GB_cuda_get_device_properties.cu @@ -12,6 +12,7 @@ // they are to be solely used by C functions in GraphBLAS/Source. 
#include "GB_cuda.hpp" + #define CU_OK(cudaMethod) \ { \ if ((cudaMethod) != cudaSuccess) \ diff --git a/GraphBLAS/CUDA/GB_cuda_stream_pool.cu b/GraphBLAS/CUDA/device/GB_cuda_stream_pool.cu similarity index 100% rename from GraphBLAS/CUDA/GB_cuda_stream_pool.cu rename to GraphBLAS/CUDA/device/GB_cuda_stream_pool.cu diff --git a/GraphBLAS/CUDA/include/GB_cuda_kernel.cuh b/GraphBLAS/CUDA/include/GB_cuda_kernel.cuh index 466ebc6cad..5813d07337 100644 --- a/GraphBLAS/CUDA/include/GB_cuda_kernel.cuh +++ b/GraphBLAS/CUDA/include/GB_cuda_kernel.cuh @@ -42,7 +42,7 @@ using namespace cooperative_groups ; // subset of GraphBLAS.h //------------------------------------------------------------------------------ -#include "GraphBLAS_cuda.hpp" +#include "include/GraphBLAS_cuda.hpp" //------------------------------------------------------------------------------ // internal #include files @@ -76,6 +76,7 @@ extern "C" #include "include/GB_omp_kernels.h" } -#include "GB_cuda_error.hpp" -#include "GB_cuda_atomics.cuh" +#include "include/GB_cuda_error.hpp" +#include "include/GB_cuda_atomics.cuh" +#include "include/GB_cuda_timer.hpp" diff --git a/GraphBLAS/CUDA/include/GraphBLAS_cuda.hpp b/GraphBLAS/CUDA/include/GraphBLAS_cuda.hpp index cea580f670..d2c4f8915d 100644 --- a/GraphBLAS/CUDA/include/GraphBLAS_cuda.hpp +++ b/GraphBLAS/CUDA/include/GraphBLAS_cuda.hpp @@ -10,6 +10,9 @@ extern "C" { // definitions that modify GraphBLAS.h + #ifndef GBNCPUFEAT + #define GBNCPUFEAT + #endif #include "include/GB_dev.h" #include "include/GB_compiler.h" #include "include/GB_warnings.h" diff --git a/GraphBLAS/CUDA/GB_cuda_finalize.c b/GraphBLAS/CUDA/init/GB_cuda_finalize.c similarity index 100% rename from GraphBLAS/CUDA/GB_cuda_finalize.c rename to GraphBLAS/CUDA/init/GB_cuda_finalize.c diff --git a/GraphBLAS/CUDA/GB_cuda_init.c b/GraphBLAS/CUDA/init/GB_cuda_init.c similarity index 97% rename from GraphBLAS/CUDA/GB_cuda_init.c rename to GraphBLAS/CUDA/init/GB_cuda_init.c index ca475f00f1..cd12573e58 
100644 --- a/GraphBLAS/CUDA/GB_cuda_init.c +++ b/GraphBLAS/CUDA/init/GB_cuda_init.c @@ -49,7 +49,7 @@ GrB_Info GB_cuda_init (void) // of the work. Alternatively, move GB_cuda_init here (if so, // ensure that it doesn't depend on any other initializations // below). - 256 * 1000000L, 1024 * 100000000L, 1) ; // FIXME: ask the GPU(s) + 256 * 1000000L, 1024 * 100000000L /*, 1 */) ; // FIXME: ask the GPU(s) } // warm up the GPUs diff --git a/GraphBLAS/CUDA/GB_cuda_warmup.cu b/GraphBLAS/CUDA/init/GB_cuda_warmup.cu similarity index 100% rename from GraphBLAS/CUDA/GB_cuda_warmup.cu rename to GraphBLAS/CUDA/init/GB_cuda_warmup.cu diff --git a/GraphBLAS/CUDA/GB_cuda_matrix_prefetch.cpp b/GraphBLAS/CUDA/matrix/GB_cuda_matrix_prefetch.cpp similarity index 99% rename from GraphBLAS/CUDA/GB_cuda_matrix_prefetch.cpp rename to GraphBLAS/CUDA/matrix/GB_cuda_matrix_prefetch.cpp index 8fa6328edf..3924b16ab9 100644 --- a/GraphBLAS/CUDA/GB_cuda_matrix_prefetch.cpp +++ b/GraphBLAS/CUDA/matrix/GB_cuda_matrix_prefetch.cpp @@ -9,6 +9,7 @@ //------------------------------------------------------------------------------ #include "GB_cuda.hpp" + #define GB_FREE_ALL ; GrB_Info GB_cuda_matrix_prefetch diff --git a/GraphBLAS/CUDA/GB_cuda_upscale_identity.cpp b/GraphBLAS/CUDA/monoid/GB_cuda_upscale_identity.cpp similarity index 100% rename from GraphBLAS/CUDA/GB_cuda_upscale_identity.cpp rename to GraphBLAS/CUDA/monoid/GB_cuda_upscale_identity.cpp diff --git a/GraphBLAS/CUDA/GB_cuda_AxB.hpp b/GraphBLAS/CUDA/mxm/GB_cuda_AxB.hpp similarity index 100% rename from GraphBLAS/CUDA/GB_cuda_AxB.hpp rename to GraphBLAS/CUDA/mxm/GB_cuda_AxB.hpp diff --git a/GraphBLAS/CUDA/GB_cuda_AxB_dot3.cpp b/GraphBLAS/CUDA/mxm/GB_cuda_AxB_dot3.cpp similarity index 99% rename from GraphBLAS/CUDA/GB_cuda_AxB_dot3.cpp rename to GraphBLAS/CUDA/mxm/GB_cuda_AxB_dot3.cpp index a3d96169c5..957ef19f79 100644 --- a/GraphBLAS/CUDA/GB_cuda_AxB_dot3.cpp +++ b/GraphBLAS/CUDA/mxm/GB_cuda_AxB_dot3.cpp @@ -20,7 +20,7 @@ 
GB_cuda_stream_pool_release (&stream) ; \ } -#include "GB_cuda_AxB.hpp" +#include "mxm/GB_cuda_AxB.hpp" //------------------------------------------------------------------------------ // GB_cuda_AxB_dot3 diff --git a/GraphBLAS/CUDA/GB_cuda_AxB_dot3_branch.cpp b/GraphBLAS/CUDA/mxm/GB_cuda_AxB_dot3_branch.cpp similarity index 98% rename from GraphBLAS/CUDA/GB_cuda_AxB_dot3_branch.cpp rename to GraphBLAS/CUDA/mxm/GB_cuda_AxB_dot3_branch.cpp index 880cb8a381..bae3782bc5 100644 --- a/GraphBLAS/CUDA/GB_cuda_AxB_dot3_branch.cpp +++ b/GraphBLAS/CUDA/mxm/GB_cuda_AxB_dot3_branch.cpp @@ -10,7 +10,6 @@ // Decide branch direction for GPU use for the dot-product C=A'*B #include "GB_cuda.hpp" -#include bool GB_cuda_AxB_dot3_branch ( diff --git a/GraphBLAS/CUDA/GB_cuda_AxB_dot3_jit.cpp b/GraphBLAS/CUDA/mxm/GB_cuda_AxB_dot3_jit.cpp similarity index 98% rename from GraphBLAS/CUDA/GB_cuda_AxB_dot3_jit.cpp rename to GraphBLAS/CUDA/mxm/GB_cuda_AxB_dot3_jit.cpp index 52cfd7e3de..ccda3c68fa 100644 --- a/GraphBLAS/CUDA/GB_cuda_AxB_dot3_jit.cpp +++ b/GraphBLAS/CUDA/mxm/GB_cuda_AxB_dot3_jit.cpp @@ -7,7 +7,7 @@ //------------------------------------------------------------------------------ -#include "GB_cuda_AxB.hpp" +#include "mxm/GB_cuda_AxB.hpp" extern "C" { diff --git a/GraphBLAS/CUDA/GB_cuda_colscale.cpp b/GraphBLAS/CUDA/mxm/GB_cuda_colscale.cpp similarity index 89% rename from GraphBLAS/CUDA/GB_cuda_colscale.cpp rename to GraphBLAS/CUDA/mxm/GB_cuda_colscale.cpp index dc8dad7d85..6b29076aac 100644 --- a/GraphBLAS/CUDA/GB_cuda_colscale.cpp +++ b/GraphBLAS/CUDA/mxm/GB_cuda_colscale.cpp @@ -1,9 +1,9 @@ -#include "GB_cuda_ewise.hpp" +#include "mxm/GB_cuda_ewise.hpp" #undef GB_FREE_ALL #define GB_FREE_ALL \ { \ - GB_cuda_stream_pool_release (&stream) ; \ + GB_cuda_stream_pool_release (&stream) ; \ } #define BLOCK_SIZE 128 diff --git a/GraphBLAS/CUDA/GB_cuda_colscale_branch.cpp b/GraphBLAS/CUDA/mxm/GB_cuda_colscale_branch.cpp similarity index 100% rename from 
GraphBLAS/CUDA/GB_cuda_colscale_branch.cpp rename to GraphBLAS/CUDA/mxm/GB_cuda_colscale_branch.cpp diff --git a/GraphBLAS/CUDA/GB_cuda_colscale_jit.cpp b/GraphBLAS/CUDA/mxm/GB_cuda_colscale_jit.cpp similarity index 98% rename from GraphBLAS/CUDA/GB_cuda_colscale_jit.cpp rename to GraphBLAS/CUDA/mxm/GB_cuda_colscale_jit.cpp index 31547d7892..d7e50005b6 100644 --- a/GraphBLAS/CUDA/GB_cuda_colscale_jit.cpp +++ b/GraphBLAS/CUDA/mxm/GB_cuda_colscale_jit.cpp @@ -1,4 +1,4 @@ -#include "GB_cuda_ewise.hpp" +#include "mxm/GB_cuda_ewise.hpp" extern "C" { diff --git a/GraphBLAS/CUDA/GB_cuda_ewise.hpp b/GraphBLAS/CUDA/mxm/GB_cuda_ewise.hpp similarity index 100% rename from GraphBLAS/CUDA/GB_cuda_ewise.hpp rename to GraphBLAS/CUDA/mxm/GB_cuda_ewise.hpp diff --git a/GraphBLAS/CUDA/GB_cuda_rowscale.cpp b/GraphBLAS/CUDA/mxm/GB_cuda_rowscale.cpp similarity index 89% rename from GraphBLAS/CUDA/GB_cuda_rowscale.cpp rename to GraphBLAS/CUDA/mxm/GB_cuda_rowscale.cpp index cabff7da06..ef554d2d6f 100644 --- a/GraphBLAS/CUDA/GB_cuda_rowscale.cpp +++ b/GraphBLAS/CUDA/mxm/GB_cuda_rowscale.cpp @@ -1,9 +1,9 @@ -#include "GB_cuda_ewise.hpp" +#include "mxm/GB_cuda_ewise.hpp" #undef GB_FREE_ALL #define GB_FREE_ALL \ { \ - GB_cuda_stream_pool_release (&stream) ; \ + GB_cuda_stream_pool_release (&stream) ; \ } #define BLOCK_SIZE 128 diff --git a/GraphBLAS/CUDA/GB_cuda_rowscale_branch.cpp b/GraphBLAS/CUDA/mxm/GB_cuda_rowscale_branch.cpp similarity index 100% rename from GraphBLAS/CUDA/GB_cuda_rowscale_branch.cpp rename to GraphBLAS/CUDA/mxm/GB_cuda_rowscale_branch.cpp diff --git a/GraphBLAS/CUDA/GB_cuda_rowscale_jit.cpp b/GraphBLAS/CUDA/mxm/GB_cuda_rowscale_jit.cpp similarity index 98% rename from GraphBLAS/CUDA/GB_cuda_rowscale_jit.cpp rename to GraphBLAS/CUDA/mxm/GB_cuda_rowscale_jit.cpp index 0ad2806c22..edca87e01c 100644 --- a/GraphBLAS/CUDA/GB_cuda_rowscale_jit.cpp +++ b/GraphBLAS/CUDA/mxm/GB_cuda_rowscale_jit.cpp @@ -1,4 +1,4 @@ -#include "GB_cuda_ewise.hpp" +#include "mxm/GB_cuda_ewise.hpp" 
extern "C" { diff --git a/GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_dense_phase1.cuh b/GraphBLAS/CUDA/mxm/template/GB_cuda_jit_AxB_dot3_dense_phase1.cuh similarity index 100% rename from GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_dense_phase1.cuh rename to GraphBLAS/CUDA/mxm/template/GB_cuda_jit_AxB_dot3_dense_phase1.cuh diff --git a/GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_phase1.cuh b/GraphBLAS/CUDA/mxm/template/GB_cuda_jit_AxB_dot3_phase1.cuh similarity index 100% rename from GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_phase1.cuh rename to GraphBLAS/CUDA/mxm/template/GB_cuda_jit_AxB_dot3_phase1.cuh diff --git a/GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_phase2.cuh b/GraphBLAS/CUDA/mxm/template/GB_cuda_jit_AxB_dot3_phase2.cuh similarity index 100% rename from GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_phase2.cuh rename to GraphBLAS/CUDA/mxm/template/GB_cuda_jit_AxB_dot3_phase2.cuh diff --git a/GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_phase2end.cuh b/GraphBLAS/CUDA/mxm/template/GB_cuda_jit_AxB_dot3_phase2end.cuh similarity index 100% rename from GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_phase2end.cuh rename to GraphBLAS/CUDA/mxm/template/GB_cuda_jit_AxB_dot3_phase2end.cuh diff --git a/GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_phase3_dndn.cuh b/GraphBLAS/CUDA/mxm/template/GB_cuda_jit_AxB_dot3_phase3_dndn.cuh similarity index 100% rename from GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_phase3_dndn.cuh rename to GraphBLAS/CUDA/mxm/template/GB_cuda_jit_AxB_dot3_phase3_dndn.cuh diff --git a/GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_phase3_mp.cuh b/GraphBLAS/CUDA/mxm/template/GB_cuda_jit_AxB_dot3_phase3_mp.cuh similarity index 98% rename from GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_phase3_mp.cuh rename to GraphBLAS/CUDA/mxm/template/GB_cuda_jit_AxB_dot3_phase3_mp.cuh index 4315c90639..d3cc062bcf 100644 --- a/GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_phase3_mp.cuh +++ b/GraphBLAS/CUDA/mxm/template/GB_cuda_jit_AxB_dot3_phase3_mp.cuh @@ 
-179,7 +179,7 @@ __global__ void GB_cuda_AxB_dot3_phase3_mp_kernel #define pY_end pB_end #define Yi Bi - #include "GB_cuda_jit_AxB_dot3_phase3_mp_guts.cuh" + #include "template/GB_cuda_jit_AxB_dot3_phase3_mp_guts.cuh" } #if 0 else @@ -199,7 +199,7 @@ __global__ void GB_cuda_AxB_dot3_phase3_mp_kernel #define Yi Ai // flip the roles of A(:,i) and B(:,j) - #include "GB_cuda_jit_AxB_dot3_phase3_mp_guts.cuh" + #include "template/GB_cuda_jit_AxB_dot3_phase3_mp_guts.cuh" } #endif diff --git a/GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_phase3_mp_guts.cuh b/GraphBLAS/CUDA/mxm/template/GB_cuda_jit_AxB_dot3_phase3_mp_guts.cuh similarity index 100% rename from GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_phase3_mp_guts.cuh rename to GraphBLAS/CUDA/mxm/template/GB_cuda_jit_AxB_dot3_phase3_mp_guts.cuh diff --git a/GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_phase3_spdn.cuh b/GraphBLAS/CUDA/mxm/template/GB_cuda_jit_AxB_dot3_phase3_spdn.cuh similarity index 100% rename from GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_phase3_spdn.cuh rename to GraphBLAS/CUDA/mxm/template/GB_cuda_jit_AxB_dot3_phase3_spdn.cuh diff --git a/GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_phase3_vsdn.cuh b/GraphBLAS/CUDA/mxm/template/GB_cuda_jit_AxB_dot3_phase3_vsdn.cuh similarity index 100% rename from GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_phase3_vsdn.cuh rename to GraphBLAS/CUDA/mxm/template/GB_cuda_jit_AxB_dot3_phase3_vsdn.cuh diff --git a/GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_phase3_vssp.cuh b/GraphBLAS/CUDA/mxm/template/GB_cuda_jit_AxB_dot3_phase3_vssp.cuh similarity index 100% rename from GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_phase3_vssp.cuh rename to GraphBLAS/CUDA/mxm/template/GB_cuda_jit_AxB_dot3_phase3_vssp.cuh diff --git a/GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_phase3_vsvs.cuh b/GraphBLAS/CUDA/mxm/template/GB_cuda_jit_AxB_dot3_phase3_vsvs.cuh similarity index 100% rename from GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_phase3_vsvs.cuh rename to 
GraphBLAS/CUDA/mxm/template/GB_cuda_jit_AxB_dot3_phase3_vsvs.cuh diff --git a/GraphBLAS/CUDA/template/GB_jit_kernel_cuda_AxB_dot3.cu b/GraphBLAS/CUDA/mxm/template/GB_jit_kernel_cuda_AxB_dot3.cu similarity index 97% rename from GraphBLAS/CUDA/template/GB_jit_kernel_cuda_AxB_dot3.cu rename to GraphBLAS/CUDA/mxm/template/GB_jit_kernel_cuda_AxB_dot3.cu index fe83b1c7b6..949541eefd 100644 --- a/GraphBLAS/CUDA/template/GB_jit_kernel_cuda_AxB_dot3.cu +++ b/GraphBLAS/CUDA/mxm/template/GB_jit_kernel_cuda_AxB_dot3.cu @@ -136,19 +136,19 @@ GB_bucket_code ; // FIXME: rename GB_dot3_bucket_code // a bitmap/sparse kernel // ... -#include "GB_cuda_tile_sum_uint64.cuh" -#include "GB_cuda_tile_reduce_ztype.cuh" +#include "template/GB_cuda_tile_sum_uint64.cuh" +#include "template/GB_cuda_tile_reduce_ztype.cuh" //------------------------------------------------------------------------------ // CUDA device kernels for each case //------------------------------------------------------------------------------ -#include "GB_cuda_ek_slice.cuh" +#include "template/GB_cuda_ek_slice.cuh" #if ((GB_A_IS_BITMAP || GB_A_IS_FULL) && (GB_B_IS_BITMAP || GB_B_IS_FULL)) // dense-dense - #include "GB_cuda_jit_AxB_dot3_dense_phase1.cuh" - #include "GB_cuda_jit_AxB_dot3_phase3_dndn.cuh" + #include "template/GB_cuda_jit_AxB_dot3_dense_phase1.cuh" + #include "template/GB_cuda_jit_AxB_dot3_phase3_dndn.cuh" #else // sparse-sparse, sparse-dense, or dense-sparse @@ -161,19 +161,19 @@ GB_bucket_code ; // FIXME: rename GB_dot3_bucket_code GB_FREE_MEMORY (&Bucket, Bu_size) ; \ } - #include "GB_cuda_jit_AxB_dot3_phase1.cuh" - #include "GB_cuda_jit_AxB_dot3_phase2.cuh" - #include "GB_cuda_jit_AxB_dot3_phase2end.cuh" + #include "template/GB_cuda_jit_AxB_dot3_phase1.cuh" + #include "template/GB_cuda_jit_AxB_dot3_phase2.cuh" + #include "template/GB_cuda_jit_AxB_dot3_phase2end.cuh" #if ((GB_A_IS_SPARSE || GB_A_IS_HYPER) && \ (GB_B_IS_SPARSE || GB_B_IS_HYPER)) // sparse-sparse - #include 
"GB_cuda_jit_AxB_dot3_phase3_mp.cuh" - #include "GB_cuda_jit_AxB_dot3_phase3_vsvs.cuh" - #include "GB_cuda_jit_AxB_dot3_phase3_vssp.cuh" + #include "template/GB_cuda_jit_AxB_dot3_phase3_mp.cuh" + #include "template/GB_cuda_jit_AxB_dot3_phase3_vsvs.cuh" + #include "template/GB_cuda_jit_AxB_dot3_phase3_vssp.cuh" #else // sparse-dense or dense-sparse - #include "GB_cuda_jit_AxB_dot3_phase3_spdn.cuh" - #include "GB_cuda_jit_AxB_dot3_phase3_vsdn.cuh" + #include "template/GB_cuda_jit_AxB_dot3_phase3_spdn.cuh" + #include "template/GB_cuda_jit_AxB_dot3_phase3_vsdn.cuh" #endif #endif @@ -181,8 +181,6 @@ GB_bucket_code ; // FIXME: rename GB_dot3_bucket_code // host function to launch the CUDA kernels for dot3 on the GPU //------------------------------------------------------------------------------ -// #include "GB_cuda_timer.hpp" - extern "C" { GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel) ; diff --git a/GraphBLAS/CUDA/template/GB_jit_kernel_cuda_colscale.cu b/GraphBLAS/CUDA/mxm/template/GB_jit_kernel_cuda_colscale.cu similarity index 98% rename from GraphBLAS/CUDA/template/GB_jit_kernel_cuda_colscale.cu rename to GraphBLAS/CUDA/mxm/template/GB_jit_kernel_cuda_colscale.cu index 6d80827dc4..570509eae9 100644 --- a/GraphBLAS/CUDA/template/GB_jit_kernel_cuda_colscale.cu +++ b/GraphBLAS/CUDA/mxm/template/GB_jit_kernel_cuda_colscale.cu @@ -3,7 +3,7 @@ using namespace cooperative_groups ; // do not #include functions inside of other functions! 
-#include "GB_cuda_ek_slice.cuh" +#include "template/GB_cuda_ek_slice.cuh" #define log2_chunk_size 10 #define chunk_size 1024 diff --git a/GraphBLAS/CUDA/template/GB_jit_kernel_cuda_rowscale.cu b/GraphBLAS/CUDA/mxm/template/GB_jit_kernel_cuda_rowscale.cu similarity index 100% rename from GraphBLAS/CUDA/template/GB_jit_kernel_cuda_rowscale.cu rename to GraphBLAS/CUDA/mxm/template/GB_jit_kernel_cuda_rowscale.cu diff --git a/GraphBLAS/CUDA/GB_cuda_reduce.hpp b/GraphBLAS/CUDA/reduce/GB_cuda_reduce.hpp similarity index 100% rename from GraphBLAS/CUDA/GB_cuda_reduce.hpp rename to GraphBLAS/CUDA/reduce/GB_cuda_reduce.hpp diff --git a/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar.cpp b/GraphBLAS/CUDA/reduce/GB_cuda_reduce_to_scalar.cpp similarity index 98% rename from GraphBLAS/CUDA/GB_cuda_reduce_to_scalar.cpp rename to GraphBLAS/CUDA/reduce/GB_cuda_reduce_to_scalar.cpp index 7864b63238..8c65a01900 100644 --- a/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar.cpp +++ b/GraphBLAS/CUDA/reduce/GB_cuda_reduce_to_scalar.cpp @@ -25,10 +25,10 @@ { \ GB_FREE_WORKSPACE ; \ GB_Matrix_free (&V) ; \ - GB_cuda_stream_pool_release (&stream) ; \ + GB_cuda_stream_pool_release (&stream) ; \ } -#include "GB_cuda_reduce.hpp" +#include "reduce/GB_cuda_reduce.hpp" GrB_Info GB_cuda_reduce_to_scalar ( diff --git a/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_branch.cpp b/GraphBLAS/CUDA/reduce/GB_cuda_reduce_to_scalar_branch.cpp similarity index 97% rename from GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_branch.cpp rename to GraphBLAS/CUDA/reduce/GB_cuda_reduce_to_scalar_branch.cpp index dbdcf62c70..56c316b5df 100644 --- a/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_branch.cpp +++ b/GraphBLAS/CUDA/reduce/GB_cuda_reduce_to_scalar_branch.cpp @@ -9,7 +9,7 @@ // Decide branch direction for GPU use for the reduction to scalar -#include "GB_cuda_reduce.hpp" +#include "reduce/GB_cuda_reduce.hpp" bool GB_cuda_reduce_to_scalar_branch // return true to use the GPU ( diff --git a/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_jit.cpp 
b/GraphBLAS/CUDA/reduce/GB_cuda_reduce_to_scalar_jit.cpp similarity index 98% rename from GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_jit.cpp rename to GraphBLAS/CUDA/reduce/GB_cuda_reduce_to_scalar_jit.cpp index 2b55cfcf12..74f9e927c3 100644 --- a/GraphBLAS/CUDA/GB_cuda_reduce_to_scalar_jit.cpp +++ b/GraphBLAS/CUDA/reduce/GB_cuda_reduce_to_scalar_jit.cpp @@ -7,7 +7,7 @@ //------------------------------------------------------------------------------ -#include "GB_cuda_reduce.hpp" +#include "reduce/GB_cuda_reduce.hpp" extern "C" { diff --git a/GraphBLAS/CUDA/template/GB_cuda_threadblock_reduce_ztype.cuh b/GraphBLAS/CUDA/reduce/template/GB_cuda_threadblock_reduce_ztype.cuh similarity index 100% rename from GraphBLAS/CUDA/template/GB_cuda_threadblock_reduce_ztype.cuh rename to GraphBLAS/CUDA/reduce/template/GB_cuda_threadblock_reduce_ztype.cuh diff --git a/GraphBLAS/CUDA/template/GB_cuda_tile_reduce_ztype.cuh b/GraphBLAS/CUDA/reduce/template/GB_cuda_tile_reduce_ztype.cuh similarity index 100% rename from GraphBLAS/CUDA/template/GB_cuda_tile_reduce_ztype.cuh rename to GraphBLAS/CUDA/reduce/template/GB_cuda_tile_reduce_ztype.cuh diff --git a/GraphBLAS/CUDA/template/GB_jit_kernel_cuda_reduce.cu b/GraphBLAS/CUDA/reduce/template/GB_jit_kernel_cuda_reduce.cu similarity index 97% rename from GraphBLAS/CUDA/template/GB_jit_kernel_cuda_reduce.cu rename to GraphBLAS/CUDA/reduce/template/GB_jit_kernel_cuda_reduce.cu index 708d96d197..47f15bc4bc 100644 --- a/GraphBLAS/CUDA/template/GB_jit_kernel_cuda_reduce.cu +++ b/GraphBLAS/CUDA/reduce/template/GB_jit_kernel_cuda_reduce.cu @@ -35,9 +35,9 @@ #define tile_sz 32 #define log2_tile_sz 5 -#include "GB_cuda_tile_sum_uint64.cuh" -#include "GB_cuda_tile_reduce_ztype.cuh" -#include "GB_cuda_threadblock_reduce_ztype.cuh" +#include "template/GB_cuda_tile_sum_uint64.cuh" +#include "template/GB_cuda_tile_reduce_ztype.cuh" +#include "template/GB_cuda_threadblock_reduce_ztype.cuh" 
//------------------------------------------------------------------------------ // GB_cuda_reduce_kernel: reduce all entries in a matrix to a single scalar diff --git a/GraphBLAS/CUDA/GB_cuda_select.hpp b/GraphBLAS/CUDA/select/GB_cuda_select.hpp similarity index 100% rename from GraphBLAS/CUDA/GB_cuda_select.hpp rename to GraphBLAS/CUDA/select/GB_cuda_select.hpp diff --git a/GraphBLAS/CUDA/GB_cuda_select_bitmap.cpp b/GraphBLAS/CUDA/select/GB_cuda_select_bitmap.cpp similarity index 96% rename from GraphBLAS/CUDA/GB_cuda_select_bitmap.cpp rename to GraphBLAS/CUDA/select/GB_cuda_select_bitmap.cpp index bbeb938616..751f6847e5 100644 --- a/GraphBLAS/CUDA/GB_cuda_select_bitmap.cpp +++ b/GraphBLAS/CUDA/select/GB_cuda_select_bitmap.cpp @@ -1,4 +1,4 @@ -#include "GB_cuda_select.hpp" +#include "select/GB_cuda_select.hpp" #undef GB_FREE_ALL #define GB_FREE_ALL \ diff --git a/GraphBLAS/CUDA/GB_cuda_select_bitmap_jit.cpp b/GraphBLAS/CUDA/select/GB_cuda_select_bitmap_jit.cpp similarity index 97% rename from GraphBLAS/CUDA/GB_cuda_select_bitmap_jit.cpp rename to GraphBLAS/CUDA/select/GB_cuda_select_bitmap_jit.cpp index c644e264ba..fba572d63c 100644 --- a/GraphBLAS/CUDA/GB_cuda_select_bitmap_jit.cpp +++ b/GraphBLAS/CUDA/select/GB_cuda_select_bitmap_jit.cpp @@ -1,4 +1,4 @@ -#include "GB_cuda_select.hpp" +#include "select/GB_cuda_select.hpp" extern "C" { diff --git a/GraphBLAS/CUDA/GB_cuda_select_branch.cpp b/GraphBLAS/CUDA/select/GB_cuda_select_branch.cpp similarity index 100% rename from GraphBLAS/CUDA/GB_cuda_select_branch.cpp rename to GraphBLAS/CUDA/select/GB_cuda_select_branch.cpp diff --git a/GraphBLAS/CUDA/GB_cuda_select_sparse.cpp b/GraphBLAS/CUDA/select/GB_cuda_select_sparse.cpp similarity index 98% rename from GraphBLAS/CUDA/GB_cuda_select_sparse.cpp rename to GraphBLAS/CUDA/select/GB_cuda_select_sparse.cpp index 51d60d615f..95d36512d9 100644 --- a/GraphBLAS/CUDA/GB_cuda_select_sparse.cpp +++ b/GraphBLAS/CUDA/select/GB_cuda_select_sparse.cpp @@ -1,5 +1,5 @@ -#include 
"GB_cuda_select.hpp" +#include "select/GB_cuda_select.hpp" #undef GB_FREE_ALL #define GB_FREE_ALL \ diff --git a/GraphBLAS/CUDA/GB_cuda_select_sparse_jit.cpp b/GraphBLAS/CUDA/select/GB_cuda_select_sparse_jit.cpp similarity index 97% rename from GraphBLAS/CUDA/GB_cuda_select_sparse_jit.cpp rename to GraphBLAS/CUDA/select/GB_cuda_select_sparse_jit.cpp index ff7a4b7f64..b54c905a89 100644 --- a/GraphBLAS/CUDA/GB_cuda_select_sparse_jit.cpp +++ b/GraphBLAS/CUDA/select/GB_cuda_select_sparse_jit.cpp @@ -1,4 +1,4 @@ -#include "GB_cuda_select.hpp" +#include "select/GB_cuda_select.hpp" extern "C" { diff --git a/GraphBLAS/CUDA/template/GB_jit_kernel_cuda_select_bitmap.cu b/GraphBLAS/CUDA/select/template/GB_jit_kernel_cuda_select_bitmap.cu similarity index 94% rename from GraphBLAS/CUDA/template/GB_jit_kernel_cuda_select_bitmap.cu rename to GraphBLAS/CUDA/select/template/GB_jit_kernel_cuda_select_bitmap.cu index f9b9d3929d..d71ef23867 100644 --- a/GraphBLAS/CUDA/template/GB_jit_kernel_cuda_select_bitmap.cu +++ b/GraphBLAS/CUDA/select/template/GB_jit_kernel_cuda_select_bitmap.cu @@ -5,9 +5,8 @@ using namespace cooperative_groups ; #define tile_sz 32 #define log2_tile_sz 5 -#include "GB_cuda_atomics.cuh" -#include "GB_cuda_tile_sum_uint64.cuh" -#include "GB_cuda_threadblock_sum_uint64.cuh" +#include "template/GB_cuda_tile_sum_uint64.cuh" +#include "template/GB_cuda_threadblock_sum_uint64.cuh" __global__ void GB_cuda_select_bitmap_kernel ( diff --git a/GraphBLAS/CUDA/template/GB_jit_kernel_cuda_select_sparse.cu b/GraphBLAS/CUDA/select/template/GB_jit_kernel_cuda_select_sparse.cu similarity index 99% rename from GraphBLAS/CUDA/template/GB_jit_kernel_cuda_select_sparse.cu rename to GraphBLAS/CUDA/select/template/GB_jit_kernel_cuda_select_sparse.cu index 8422345299..814c746052 100644 --- a/GraphBLAS/CUDA/template/GB_jit_kernel_cuda_select_sparse.cu +++ b/GraphBLAS/CUDA/select/template/GB_jit_kernel_cuda_select_sparse.cu @@ -47,7 +47,9 @@ using namespace cooperative_groups ; -#include 
"GB_cuda_ek_slice.cuh" +#include "template/GB_cuda_ek_slice.cuh" + +// FIXME: put the following elsewhere, say GB_cuda_kernel.cuh: #include #ifdef TIMING #include "omp.h" diff --git a/GraphBLAS/CUDA/template/GB_jit_kernel_cuda_select_sparse_OLD.cu b/GraphBLAS/CUDA/select/template/GB_jit_kernel_cuda_select_sparse_OLD.cu similarity index 99% rename from GraphBLAS/CUDA/template/GB_jit_kernel_cuda_select_sparse_OLD.cu rename to GraphBLAS/CUDA/select/template/GB_jit_kernel_cuda_select_sparse_OLD.cu index d8f64fa94c..d1043f39c4 100644 --- a/GraphBLAS/CUDA/template/GB_jit_kernel_cuda_select_sparse_OLD.cu +++ b/GraphBLAS/CUDA/select/template/GB_jit_kernel_cuda_select_sparse_OLD.cu @@ -1,7 +1,7 @@ using namespace cooperative_groups ; -#include "GB_cuda_ek_slice.cuh" -#include "GB_cuda_cumsum.cuh" +#include "template/GB_cuda_ek_slice.cuh" +#include "template/GB_cuda_cumsum.cuh" #define GB_FREE_WORKSPACE \ { \ diff --git a/GraphBLAS/CUDA/template/GB_cuda_ek_slice.cuh b/GraphBLAS/CUDA/slice/template/GB_cuda_ek_slice.cuh similarity index 100% rename from GraphBLAS/CUDA/template/GB_cuda_ek_slice.cuh rename to GraphBLAS/CUDA/slice/template/GB_cuda_ek_slice.cuh diff --git a/GraphBLAS/CUDA/GB_cuda_type_branch.cpp b/GraphBLAS/CUDA/type/GB_cuda_type_branch.cpp similarity index 100% rename from GraphBLAS/CUDA/GB_cuda_type_branch.cpp rename to GraphBLAS/CUDA/type/GB_cuda_type_branch.cpp diff --git a/GraphBLAS/Config/GraphBLASConfig.cmake.in b/GraphBLAS/Config/GraphBLASConfig.cmake.in index 0a50fd7826..71b3587282 100644 --- a/GraphBLAS/Config/GraphBLASConfig.cmake.in +++ b/GraphBLAS/Config/GraphBLASConfig.cmake.in @@ -40,27 +40,6 @@ set ( GRAPHBLAS_VERSION "@GraphBLAS_VERSION_MAJOR@.@GraphBLAS_VERSION_MINOR@.@Gr include ( CMakeFindDependencyMacro ) set ( _dependencies_found ON ) -if ( @GRAPHBLAS_HAS_CUDA@ ) - # Look for imported targets of additional dependency if GraphBLAS was built with CUDA - - if ( NOT GraphBLAS_CUDA_FOUND ) - if ( @SUITESPARSE_IN_BUILD_TREE@ ) - # First check in a 
common build tree - find_dependency ( GraphBLAS_CUDA @GraphBLAS_VERSION_MAJOR@.@GraphBLAS_VERSION_MINOR@.@GraphBLAS_VERSION_SUB@ - PATHS ${CMAKE_SOURCE_DIR}/../GraphBLAS/build/CUDA NO_DEFAULT_PATH ) - # Then, check in the currently active CMAKE_MODULE_PATH - if ( NOT GraphBLAS_CUDA_FOUND ) - find_dependency ( GraphBLAS_CUDA @GraphBLAS_VERSION_MAJOR@.@GraphBLAS_VERSION_MINOR@.@GraphBLAS_VERSION_SUB@ ) - endif ( ) - else ( ) - find_dependency ( GraphBLAS_CUDA @GraphBLAS_VERSION_MAJOR@.@GraphBLAS_VERSION_MINOR@.@GraphBLAS_VERSION_SUB@ ) - endif ( ) - endif ( ) - if ( NOT GraphBLAS_CUDA_FOUND ) - set ( _dependencies_found OFF ) - endif ( ) -endif ( ) - # Look for OpenMP if ( @GRAPHBLAS_HAS_OPENMP@ AND NOT OpenMP_C_FOUND ) find_dependency ( OpenMP COMPONENTS C ) @@ -255,3 +234,4 @@ if ( GRAPHBLAS_STATIC ) set_target_properties ( SuiteSparse::GraphBLAS_static PROPERTIES OUTPUT_NAME ${_graphblas_library_name} ) endif ( ) + diff --git a/GraphBLAS/Doc/ChangeLog b/GraphBLAS/Doc/ChangeLog index 7531e71fe1..4abd225652 100644 --- a/GraphBLAS/Doc/ChangeLog +++ b/GraphBLAS/Doc/ChangeLog @@ -1,3 +1,20 @@ +Jan 21, 2026, version 10.3.1 + + * (67) bug fix: incorrect JIT kernel constructed for R=masker(C,M,Z) + when R is hypersparse. Found by Roi Lipman, FalkorDB. + * (66) bug fix: GB_bitonic did not compile with the MS cl compiler. + Found by Erik Welch, NVIDIA. + +Dec 3, 2025, version 10.3.0 + + * GrB_extract: performance improvement for C=A(I,J) when A is very large + and (typically) hypersparse, and I is an explicit list and also large. + * GrB_(Matrix,Vector,Scalar)_dup: pending work is now left unfinished; + any pending work in the input matrix will appear in the copy. In + v10.2.0 and earlier, all pending work in the input matrix was finished + first. To obtain the behavior in v10.2.0 and earlier, simply call + GrB_*_wait on the input matrix before calling GrB_*_dup. 
+ Nov 1, 2025, version 10.2.0 * printing of user-defined types: get/set with GxB_PRINT_FUNCTION diff --git a/GraphBLAS/Doc/GraphBLAS_UserGuide.pdf b/GraphBLAS/Doc/GraphBLAS_UserGuide.pdf index b1cfd5e886..14bc183482 100644 Binary files a/GraphBLAS/Doc/GraphBLAS_UserGuide.pdf and b/GraphBLAS/Doc/GraphBLAS_UserGuide.pdf differ diff --git a/GraphBLAS/Doc/GraphBLAS_version.tex b/GraphBLAS/Doc/GraphBLAS_version.tex index ad8d227add..267f6f81f7 100644 --- a/GraphBLAS/Doc/GraphBLAS_version.tex +++ b/GraphBLAS/Doc/GraphBLAS_version.tex @@ -1,5 +1,5 @@ % version of SuiteSparse:GraphBLAS \date{VERSION -10.2.0, -Nov 1, 2025} +10.3.1, +Jan 21, 2026} diff --git a/GraphBLAS/Doc/UserGuide/GrB_objects_Matrix.tex b/GraphBLAS/Doc/UserGuide/GrB_objects_Matrix.tex index a25aaf8f29..567832b762 100644 --- a/GraphBLAS/Doc/UserGuide/GrB_objects_Matrix.tex +++ b/GraphBLAS/Doc/UserGuide/GrB_objects_Matrix.tex @@ -189,6 +189,12 @@ \subsubsection{{\sf GrB\_Matrix\_dup:} copy a matrix} no effect on the other. The \verb'GrB_NAME' is copied into the new matrix. +In GraphBLAS v10.2.0 and earlier, a call to \verb'GrB_Matrix_dup' triggered +an internal call to \verb'GrB_Matrix_wait' on the input matrix \verb'A'. In +v10.3.0 and later, this call is removed. To obtain the old behavior of +v10.2.0 and earlier, simply call \verb'GrB_Matrix_wait' on the input matrix +\verb'A' prior to calling \verb'GrB_Matrix_dup(&C,A)'. + %------------------------------------------------------------------------------- \subsubsection{{\sf GrB\_Matrix\_clear:} clear a matrix of all entries} %------------------------------------------------------------------------------- diff --git a/GraphBLAS/Doc/UserGuide/GrB_objects_Scalar.tex b/GraphBLAS/Doc/UserGuide/GrB_objects_Scalar.tex index 127b6cf5c6..dfc78b8847 100644 --- a/GraphBLAS/Doc/UserGuide/GrB_objects_Scalar.tex +++ b/GraphBLAS/Doc/UserGuide/GrB_objects_Scalar.tex @@ -121,6 +121,13 @@ \subsubsection{{\sf GrB\_Scalar\_dup:} copy a scalar} effect on the other.
The \verb'GrB_NAME' is copied into the new scalar. +In GraphBLAS v10.2.0 and earlier, a call to \verb'GrB_Scalar_dup' triggered +an internal call to \verb'GrB_Scalar_wait' on the input scalar \verb't'. In +v10.3.0 and later, this call is removed. To obtain the old behavior of +v10.2.0 and earlier, simply call \verb'GrB_Scalar_wait' on the input scalar +\verb't' prior to calling \verb'GrB_Scalar_dup(&s,t)'. + +\newpage %------------------------------------------------------------------------------- \subsubsection{{\sf GrB\_Scalar\_clear:} clear a scalar of its entry} %------------------------------------------------------------------------------- diff --git a/GraphBLAS/Doc/UserGuide/GrB_objects_Vector.tex b/GraphBLAS/Doc/UserGuide/GrB_objects_Vector.tex index 43268191ed..5cf4c17ebd 100644 --- a/GraphBLAS/Doc/UserGuide/GrB_objects_Vector.tex +++ b/GraphBLAS/Doc/UserGuide/GrB_objects_Vector.tex @@ -156,6 +156,13 @@ \subsubsection{{\sf GrB\_Vector\_dup:} copy a vector} no effect on the other. The \verb'GrB_NAME' is copied into the new vector. +In GraphBLAS v10.2.0 and earlier, a call to \verb'GrB_Vector_dup' triggered +an internal call to \verb'GrB_Vector_wait' on the input vector \verb'u'. In +v10.3.0 and later, this call is removed. To obtain the old behavior of +v10.2.0 and earlier, simply call \verb'GrB_Vector_wait' on the input vector +\verb'u' prior to calling \verb'GrB_Vector_dup(&w,u)'. + +\newpage %------------------------------------------------------------------------------- \subsubsection{{\sf GrB\_Vector\_clear:} clear a vector of all entries} %------------------------------------------------------------------------------- @@ -178,7 +185,6 @@ \subsubsection{{\sf GrB\_Vector\_clear:} clear a vector of all entries} with \verb'v (:) = sparse(0)' in MATLAB. The type and dimension of \verb'v' do not change. Any pending updates to the vector are discarded.
-\newpage %------------------------------------------------------------------------------- \subsubsection{{\sf GrB\_Vector\_size:} return the size of a vector} %------------------------------------------------------------------------------- @@ -219,6 +225,7 @@ \subsubsection{{\sf GrB\_Vector\_nvals:} return the number of entries in GraphBLAS need not be zero and \verb'nnz' (short for ``number of nonzeros'') in MATLAB is better described as ``number of entries'' in GraphBLAS. +\newpage %------------------------------------------------------------------------------- \subsubsection{{\sf GrB\_Vector\_build:} build a vector from a set of tuples} %------------------------------------------------------------------------------- @@ -364,7 +371,6 @@ \subsubsection{{\sf GrB\_Vector\_setElement:} add an entry to a vector} \verb'GrB_Matrix_setElement' in Section~\ref{matrix_setElement}. If an error occurs, \verb'GrB_error(&err,w)' returns details about the error. -\newpage %------------------------------------------------------------------------------- \subsubsection{{\sf GrB\_Vector\_extractElement:} get an entry from a vector} %------------------------------------------------------------------------------- diff --git a/GraphBLAS/Doc/UserGuide/GrB_release.tex b/GraphBLAS/Doc/UserGuide/GrB_release.tex index 5344152f01..7dcd6c0a1e 100644 --- a/GraphBLAS/Doc/UserGuide/GrB_release.tex +++ b/GraphBLAS/Doc/UserGuide/GrB_release.tex @@ -5,12 +5,37 @@ \section{Release Notes} \begin{itemize} -\item Nov 1, version 10.2.0 +\item Jan 21, 2026: version 10.3.1 \begin{itemize} - \item printing of user-defined types: get/set with \verb'GxB_PRINT_FUNCTION'. + \item (67) bug fix: incorrect JIT kernel constructed for R=masker(C,M,Z) + when R is hypersparse. Found by Roi Lipman, FalkorDB. + \item (66) bug fix: \verb'GB_bitonic' did not compile with the MS cl + compiler. Found by Erik Welch, NVIDIA. 
+ \end{itemize} + +\item Dec 3, 2025: version 10.3.0 + + \begin{itemize} + \item \verb'GrB_extract': performance improvement for \verb'C=A(I,J)' when + \verb'A' is very large and (typically) hypersparse, and \verb'I' is an + explicit list and also large. + \item \verb'GrB_(Matrix,Vector,Scalar)_dup': pending work is now left + unfinished; any pending work in the input matrix will appear in the + copy. In v10.2.0 and earlier, all pending work in the input matrix was + finished first. To obtain the behavior in v10.2.0 and earlier, simply + call \verb'GrB_*_wait' on the input matrix before calling + \verb'GrB_*_dup'. + \end{itemize} + +\item Nov 1, 2025: version 10.2.0 + + \begin{itemize} + \item printing of user-defined types: get/set with + \verb'GxB_PRINT_FUNCTION'. \item doc: documentation of \verb'GrB_Matrix_import/export' was backwards. - \item performance: new \verb'GrB_assign' kernel for \verb'C+=A' added. + \item performance: new \verb'GrB_assign' kernel for \verb'C+=A' + added. \end{itemize} \item July 25, 2025: version 10.1.1 diff --git a/GraphBLAS/GraphBLAS/@GrB/GrB.m b/GraphBLAS/GraphBLAS/@GrB/GrB.m index d737ed1745..5f13c443c7 100644 --- a/GraphBLAS/GraphBLAS/@GrB/GrB.m +++ b/GraphBLAS/GraphBLAS/@GrB/GrB.m @@ -363,7 +363,7 @@ % operations: % C = GrB.build (I,J,X,m,n,dup,type,desc) build a GrB matrix from % list of entries (like C=sparse(I,J,X...)) -% [C,I,J] = GrB.compact (A,id) remove empty rows and columns +% [C,I,J] = GrB.compact (A,id,s) remove empty rows and columns % c = GrB.entries (A,...) 
count or query entries in a matrix % C = GrB.expand (scalar, A) expand a scalar (C = scalar*spones(A)) % [I,J,X] = GrB.extracttuples (A,desc) extract all entries (like 'find') @@ -996,7 +996,7 @@ C = cell2mat (A) ; c = chunk (c) ; clear ; - [C, I, J] = compact (A, id) ; + [C, I, J] = compact (A, id, symmetric) ; descriptorinfo (d) ; C = deserialize (blob, mode, arg3) ; % arg3 for testing only Y = dnn (W, bias, Y0) ; % uses GrB matrices diff --git a/GraphBLAS/GraphBLAS/@GrB/compact.m b/GraphBLAS/GraphBLAS/@GrB/compact.m index 566c72597d..6dee801658 100644 --- a/GraphBLAS/GraphBLAS/@GrB/compact.m +++ b/GraphBLAS/GraphBLAS/@GrB/compact.m @@ -1,4 +1,4 @@ -function [C, I, J] = compact (A, id) +function [C, I, J] = compact (A, id, symmetric) %GRB.COMPACT remove empty rows and columns from a matrix. % C = GrB.compact (A) returns rows and columns from A that have no entries. % It has no effect on a full matrix, except to convert it to a @@ -11,12 +11,19 @@ % % To remove rows and colums that either have no entries, or that only have % entries equal to a particular scalar value, use C = GrB.compact (A, id), -% where id is the scalar value. +% where id is the scalar value. To skip this option, use id = [ ]. % % With two additional output arguments, [C,I,J] = GrB.compact (A, ...), % the indices of non-empty rows and columns of A are returned, so that % C = A (I,J). The lists I and J are returned in sorted order. % +% A third parameter specifies if C should be constructed symmetrically. +% C = GrB.compact (A, [ ], 'symmetric') constructs a list I = union (I,J), +% and then extracts C = A (I,I). To use this option, A must be square, but +% it need not be symmetric. This option is useful for pruning nodes of +% a directed graph A that have no incoming or outgoing edges. The graph of +% C will include any node with either incoming or outgoing (or both) edges. 
+% % Example: % % n = 2^40 ; @@ -41,7 +48,15 @@ A = A.opaque ; end -if (nargin > 1) +symmetric = (nargin > 2 && isequal (symmetric, 'symmetric')) ; +if (symmetric) + [m n] = gbsize (A) ; + if (m ~= n) + error ('A must be square to use the "symmetric" option') ; + end +end + +if (nargin > 1 && ~isempty (id)) % prune identity values from A id = gb_get_scalar (id) ; if (id ~= 0) @@ -57,6 +72,11 @@ I = gb_entries (A, 'row', 'list') ; J = gb_entries (A, 'col', 'list') ; +if (symmetric) + I = union (I, J) ; + J = I ; +end + % C = A (I,J) C = GrB (gbextract (A, { I }, { J })) ; diff --git a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbselect.c b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbselect.c index b38d1919eb..480662928a 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbselect.c +++ b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbselect.c @@ -298,6 +298,7 @@ void mexFunction OK (GrB_Scalar_new (&Zero, atype)) ; OK (GrB_Scalar_setElement_INT32 (Zero, 0)) ; b = (GrB_Matrix) Zero ; + Zero = NULL ; } //-------------------------------------------------------------------------- diff --git a/GraphBLAS/GraphBLAS/rename/GB_rename.h b/GraphBLAS/GraphBLAS/rename/GB_rename.h index 8f57768a04..5bd4246d12 100644 --- a/GraphBLAS/GraphBLAS/rename/GB_rename.h +++ b/GraphBLAS/GraphBLAS/rename/GB_rename.h @@ -155,6 +155,7 @@ #define GB_bitmap_expand_to_hyper GM_bitmap_expand_to_hyper #define GB_bitmap_M_scatter_whole GM_bitmap_M_scatter_whole #define GB_bitmap_subref GM_bitmap_subref +#define GB_bitonic GM_bitonic #define GB_bitset_int16 GM_bitset_int16 #define GB_bitset_int32 GM_bitset_int32 #define GB_bitset_int64 GM_bitset_int64 diff --git a/GraphBLAS/GraphBLAS/test/gbtest54.m b/GraphBLAS/GraphBLAS/test/gbtest54.m index df12bfbf3f..ec3f74e081 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest54.m +++ b/GraphBLAS/GraphBLAS/test/gbtest54.m @@ -16,6 +16,17 @@ [C, I, J] = GrB.compact (H, 0) ; assert (isequal (C, A (:,2:end))) ; +A = sprand (n, n, 0.02) ; +[C, I, J] = 
GrB.compact (A, [ ], 'symmetric') ; +assert (isequal (I, J)) ; +C2 = A (I, I) ; +assert (isequal (C, C2)) ; + +[C, I, J] = GrB.compact (A, [ ]) ; +assert (~isequal (I, J)) ; +C2 = A (I, J) ; +assert (isequal (C, C2)) ; + A = ones (4) ; A (1,1) = 2 ; G = GrB.compact (A, 2) ; @@ -25,5 +36,15 @@ A = sparse (A) ; assert (isequal (G, A)) ; +A = sprand (n, n/2, 0.5) ; +try + [C, I, J] = GrB.compact (A, [ ], 'symmetric') ; + ok = 0 ; +catch expected_error + expected_error + ok = 1 ; +end +assert (ok) ; + fprintf ('gbtest54: all tests passed\n') ; diff --git a/GraphBLAS/Include/GraphBLAS.h b/GraphBLAS/Include/GraphBLAS.h index 7efe28239b..86ab614231 100644 --- a/GraphBLAS/Include/GraphBLAS.h +++ b/GraphBLAS/Include/GraphBLAS.h @@ -1,4 +1,4 @@ -// SuiteSparse:GraphBLAS 10.2.0 +// SuiteSparse:GraphBLAS 10.3.1 //------------------------------------------------------------------------------ // GraphBLAS.h: definitions for the GraphBLAS package //------------------------------------------------------------------------------ @@ -286,10 +286,10 @@ // The version of this implementation, and the GraphBLAS API version: #define GxB_IMPLEMENTATION_NAME "SuiteSparse:GraphBLAS" -#define GxB_IMPLEMENTATION_DATE "Nov 1, 2025" +#define GxB_IMPLEMENTATION_DATE "Jan 21, 2026" #define GxB_IMPLEMENTATION_MAJOR 10 -#define GxB_IMPLEMENTATION_MINOR 2 -#define GxB_IMPLEMENTATION_SUB 0 +#define GxB_IMPLEMENTATION_MINOR 3 +#define GxB_IMPLEMENTATION_SUB 1 #define GxB_SPEC_DATE "Dec 22, 2023" #define GxB_SPEC_MAJOR 2 #define GxB_SPEC_MINOR 1 diff --git a/GraphBLAS/JITpackage/CMakeLists.txt b/GraphBLAS/JITpackage/CMakeLists.txt index 64492a1620..6f975d7f53 100644 --- a/GraphBLAS/JITpackage/CMakeLists.txt +++ b/GraphBLAS/JITpackage/CMakeLists.txt @@ -76,7 +76,8 @@ if ( TARGET grb_jitpackage ) file ( GLOB GRB_SOURCE_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "../Include/GraphBLAS.h" - "../CUDA/template/*" + "../CUDA/*/template/*" + "../CUDA/*/include/*" "../CUDA/include/*" "../Source/*/template/*" 
"../Source/*/include/*" diff --git a/GraphBLAS/README.md b/GraphBLAS/README.md index 6f36bdfb08..1d416dd328 100644 --- a/GraphBLAS/README.md +++ b/GraphBLAS/README.md @@ -4,7 +4,7 @@ SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2025, All Rights Reserved. SPDX-License-Identifier: Apache-2.0 -VERSION 10.2.0, Nov 1, 2025 +VERSION 10.3.1, Jan 21, 2026 SuiteSparse:GraphBLAS is a complete implementation of the GraphBLAS standard, which defines a set of sparse matrix operations on an extended algebra of diff --git a/GraphBLAS/Source/builder/GB_build.c b/GraphBLAS/Source/builder/GB_build.c index f609e28b1b..a17cc66d39 100644 --- a/GraphBLAS/Source/builder/GB_build.c +++ b/GraphBLAS/Source/builder/GB_build.c @@ -336,8 +336,8 @@ GrB_Info GB_build // build matrix // created an iso-valued matrix T, but this is not yet known. X_iso is // false for these methods. Since it has not yet been conformed to its // final sparsity structure, the matrix T is hypersparse, not bitmap. It - // has no zombies or pending tuples, so GB_all_entries_are_iso does need to - // handle those cases. T->x [0] is the new iso value of T. + // has no zombies or pending tuples, so GB_all_entries_are_iso does not + // need to handle those cases. T->x [0] is the new iso value of T. if (!X_iso && GB_all_entries_are_iso (T)) { diff --git a/GraphBLAS/Source/builder/GB_builder.c b/GraphBLAS/Source/builder/GB_builder.c index f01a4142c4..7ad2532bb1 100644 --- a/GraphBLAS/Source/builder/GB_builder.c +++ b/GraphBLAS/Source/builder/GB_builder.c @@ -22,8 +22,8 @@ // The work is done in major 5 Steps, some of which can be skipped, depending // on how the tuples are provided (*_work or *_input), and whether or not they // are sorted, or have duplicates. If vdim <= 1, some work is skipped (for -// GrB_Vectors, and single-vector GrB_Matrices). Let e be the of tuples on -// input. Let p be the # of threads used. +// GrB_Vectors, and single-vector GrB_Matrices). Let e be the number of tuples +// on input. 
Let p be the number of threads used. // STEP 1: copy user input. O(e/p) read/write per thread, or skipped. @@ -40,47 +40,58 @@ // STEP 5: assemble the tuples. O(e/p) read/writes per thread, or O(1) if the // values can be transplanted into T as-is. -// For GrB_Matrix_build: If the input (I_input, J_input, S_input) is already -// sorted with no duplicates, and no typecasting needs to be done, then Step 1 -// still must be done (each thread does O(e/p) reads of (I_input,J_input) and -// writes to I_work), but Step 1 also does the work for Step 3. Step 2 and 3 -// are skipped. Step 4 does O(e/p) reads per thread (J_input only). Then -// I_work is transplanted into T->i. Step 5 does O(e/p) read/writes per thread -// to copy Sx into T->x. +// For GrB_Matrix_build and GrB_Matrix_import: The (I_work, J_work, S_work) +// inputs are not used, and are NULL. If the input (I_input, J_input, +// S_input) is already sorted with no duplicates, and no typecasting needs +// to be done, then Step 1 still must be done (each thread does O(e/p) +// reads of (I_input,J_input) and writes to I_work), but Step 1 also does +// the work for Step 3. Step 2 and 3 are skipped. Step 4 does O(e/p) +// reads per thread (J_input only). Then I_work is transplanted into +// T->i. Step 5 does O(e/p) read/writes per thread to copy Sx into T->x. +// After the matrix is built, GB_all_entries_are_iso is called to +// determine if the matrix is iso. // For GrB_Vector_build: as GrB_Matrix_build, Step 1 does O(e/p) read/writes -// per thread. The input is always a vector, so vdim == 1 always holds. Step -// 2 is skipped if the indices are already sorted, and Step 3 does no work at -// all unless duplicates appear. Step 4 takes no time, for any vector. Step 5 -// does O(e/p) reads/writes per thread. +// per thread. The input is always a vector, so vdim == 1 always holds, +// and J_input is NULL. Step 2 is skipped if the indices are already +// sorted, and Step 3 does no work at all unless duplicates appear. 
Step +// 4 takes no time, for any vector. Step 5 does O(e/p) reads/writes per +// thread. After the vector is built, GB_all_entries_are_iso is called to +// determine if the vector is iso. // For GB_wait: the pending tuples are provided as I_work, J_work, and S_work, -// so Step 1 is skipped (no need to check for invalid indices). The input -// J_work may be null (vdim can be anything, since GB_wait is used for both -// vectors and matrices). The tuples might be in sorted order already, which -// is known precisely known from A->Pending->sorted. Step 2 does -// O((e log e)/p) work to sort the tuples. Duplicates may appear, and -// out-of-order tuples are likely. Step 3 does O(e/p) read/writes. Step 4 -// does O(e/p) reads per thread of (I_work,J_work), or just I_work. Step 5 -// does O(e/p) read/writes per thread, or O(1) time if S_work can be -// transplanted into T->x. +// so Step 1 is skipped (no need to check for invalid indices). The input +// J_work may be NULL (vdim can be anything, since GB_wait is used for +// both vectors and matrices). The tuples might be in sorted order +// already, which is known precisely from A->Pending->sorted. Step +// 2 does O((e log e)/p) work to sort the tuples. Duplicates may appear, +// and out-of-order tuples are likely. Step 3 does O(e/p) read/writes. +// Step 4 does O(e/p) reads per thread of (I_work,J_work), or just I_work. +// Step 5 does O(e/p) read/writes per thread, or O(1) time if S_work can +// be transplanted into T->x. // For GB_transpose: uses I_work, J_work, and either S_input (if no op applied -// to the values) or S_work (if an op was applied to the A->x values). This is -// only done for matrices, not vectors, so vdim > 1 will always hold. The -// indices are valid so Step 1 is skipped. The tuples are not sorted, so Step -// 2 takes O((e log e)/p) time to do the sort. There are no duplicates, so -// Step 3 only does O(e/p) reads of J_work to count the vectors in each slice. 
-// Step 4 only does O(e/p) reads of J_work to compute T->h and T->p. Step 5 -// does O(e/p) read/writes per thread, but it uses the simpler case in -// GB_bld_template since no duplicates can appear. It is unlikely -// able to transplant S_work into T->x since the input will almost always be -// unsorted. - -// For GB_concat_hyper: uses I_work, J_work, and S_work. No duplicates -// appear. Tuples are not sorted on input. I_work is transplanted into C->i. -// J_work and S_work are freed on output. S_work is not transplanted into -// C->x. +// to the values) or S_work (if an op was applied to the A->x values). +// This is only done for matrices, not vectors, so vdim > 1 will always +// hold. The indices are valid so Step 1 is skipped. The tuples are not +// sorted, so Step 2 takes O((e log e)/p) time to do the sort. There are +// no duplicates, so Step 3 only does O(e/p) reads of J_work to count the +// vectors in each slice. Step 4 only does O(e/p) reads of J_work to +// compute T->h and T->p. Step 5 does O(e/p) read/writes per thread, but +// it uses the simpler case in GB_bld_template since no duplicates can +// appear. It is unlikely able to transplant S_work into T->x since the +// input will almost always be unsorted. + +// For GB_concat_hyper: uses I_work, J_work, and S_work (or S_input is a +// scalar if C is iso). No duplicates appear. Tuples are not sorted on +// input. I_work is transplanted into C->i. J_work and S_work are freed +// on output. S_work is not transplanted into C->x. + +// For GB_hyper_hash_build: uses I_work, J_work, and S_work. No duplicates +// appear. Tuples are not sorted on input. S_iso is false. + +// For GB_reshape: uses I_work, J_work, and S_work. No duplicates appear. +// Tuples can be sorted or unsorted on input. // For iso inputs/outputs: T and Sx have the same iso property. If they are // iso, then dup is always NULL. 
Duplicates may or may not appear if T and Sx diff --git a/GraphBLAS/Source/builtin/include/GB_opaque.h b/GraphBLAS/Source/builtin/include/GB_opaque.h index e3e0d86d7f..aa0f3ff43b 100644 --- a/GraphBLAS/Source/builtin/include/GB_opaque.h +++ b/GraphBLAS/Source/builtin/include/GB_opaque.h @@ -659,117 +659,87 @@ struct GB_Matrix_opaque // content of GrB_Matrix // for declaring pointers for specific matrices (C, M, A, B, S, R, Z): // C matrix: - #define GB_Cp_DECLARE(Cp,const) GB_MDECL (Cp, const, u) - #define GB_Ch_DECLARE(Ch,const) GB_MDECL (Ch, const, u) - #define GB_Ci_DECLARE(Ci,const) GB_MDECL (Ci, const, ) - #define GB_Ci_DECLARE_U(Ci,const) GB_MDECL (Ci, const, u) - #define GB_CYp_DECLARE(C_Yp,const) GB_MDECL (C_Yp, const, u) - #define GB_CYi_DECLARE(C_Yi,const) GB_MDECL (C_Yi, const, u) - #define GB_CYx_DECLARE(C_Yx,const) GB_MDECL (C_Yx, const, u) + #define GB_Cp_DECLARE(Cp,const) GB_MDECL (Cp, const, u) + #define GB_Ch_DECLARE(Ch,const) GB_MDECL (Ch, const, u) + #define GB_Ci_DECLARE(Ci,const) GB_MDECL (Ci, const, ) + #define GB_Ci_DECLARE_U(Ci,const) GB_MDECL (Ci, const, u) #define GB_CPendingi_DECLARE(Pending_i) GB_MDECL (Pending_i, , u) #define GB_CPendingj_DECLARE(Pending_j) GB_MDECL (Pending_j, , u) // M matrix: - #define GB_Mp_DECLARE(Mp,const) GB_MDECL (Mp, const, u) - #define GB_Mh_DECLARE(Mh,const) GB_MDECL (Mh, const, u) - #define GB_Mi_DECLARE(Mi,const) GB_MDECL (Mi, const, ) - #define GB_Mi_DECLARE_U(Mi,const) GB_MDECL (Mi, const, u) - #define GB_MYp_DECLARE(M_Yp,const) GB_MDECL (M_Yp, const, u) - #define GB_MYi_DECLARE(M_Yi,const) GB_MDECL (M_Yi, const, u) - #define GB_MYx_DECLARE(M_Yx,const) GB_MDECL (M_Yx, const, u) + #define GB_Mp_DECLARE(Mp,const) GB_MDECL (Mp, const, u) + #define GB_Mh_DECLARE(Mh,const) GB_MDECL (Mh, const, u) + #define GB_Mi_DECLARE(Mi,const) GB_MDECL (Mi, const, ) + #define GB_Mi_DECLARE_U(Mi,const) GB_MDECL (Mi, const, u) // A matrix: - #define GB_Ap_DECLARE(Ap,const) GB_MDECL (Ap, const, u) - #define 
GB_Ah_DECLARE(Ah,const) GB_MDECL (Ah, const, u) - #define GB_Ai_DECLARE(Ai,const) GB_MDECL (Ai, const, ) - #define GB_Ai_DECLARE_U(Ai,const) GB_MDECL (Ai, const, u) - #define GB_AYp_DECLARE(A_Yp,const) GB_MDECL (A_Yp, const, u) - #define GB_AYi_DECLARE(A_Yi,const) GB_MDECL (A_Yi, const, u) - #define GB_AYx_DECLARE(A_Yx,const) GB_MDECL (A_Yx, const, u) + #define GB_Ap_DECLARE(Ap,const) GB_MDECL (Ap, const, u) + #define GB_Ah_DECLARE(Ah,const) GB_MDECL (Ah, const, u) + #define GB_Ai_DECLARE(Ai,const) GB_MDECL (Ai, const, ) + #define GB_Ai_DECLARE_U(Ai,const) GB_MDECL (Ai, const, u) // B matrix: - #define GB_Bp_DECLARE(Bp,const) GB_MDECL (Bp, const, u) - #define GB_Bh_DECLARE(Bh,const) GB_MDECL (Bh, const, u) - #define GB_Bi_DECLARE(Bi,const) GB_MDECL (Bi, const, ) - #define GB_Bi_DECLARE_U(Bi,const) GB_MDECL (Bi, const, u) - #define GB_BYp_DECLARE(B_Yp,const) GB_MDECL (B_Yp, const, u) - #define GB_BYi_DECLARE(B_Yi,const) GB_MDECL (B_Yi, const, u) - #define GB_BYx_DECLARE(B_Yx,const) GB_MDECL (B_Yx, const, u) + #define GB_Bp_DECLARE(Bp,const) GB_MDECL (Bp, const, u) + #define GB_Bh_DECLARE(Bh,const) GB_MDECL (Bh, const, u) + #define GB_Bi_DECLARE(Bi,const) GB_MDECL (Bi, const, ) + #define GB_Bi_DECLARE_U(Bi,const) GB_MDECL (Bi, const, u) // S matrix: - #define GB_Sp_DECLARE(Sp,const) GB_MDECL (Sp, const, u) - #define GB_Sh_DECLARE(Sh,const) GB_MDECL (Sh, const, u) - #define GB_Si_DECLARE(Si,const) GB_MDECL (Si, const, ) - #define GB_Si_DECLARE_U(Si,const) GB_MDECL (Si, const, u) - #define GB_SYp_DECLARE(S_Yp,const) GB_MDECL (S_Yp, const, u) - #define GB_SYi_DECLARE(S_Yi,const) GB_MDECL (S_Yi, const, u) - #define GB_SYx_DECLARE(S_Yx,const) GB_MDECL (S_Yx, const, u) + #define GB_Sp_DECLARE(Sp,const) GB_MDECL (Sp, const, u) + #define GB_Sh_DECLARE(Sh,const) GB_MDECL (Sh, const, u) + #define GB_Si_DECLARE(Si,const) GB_MDECL (Si, const, ) + #define GB_Si_DECLARE_U(Si,const) GB_MDECL (Si, const, u) // R matrix: - #define GB_Rp_DECLARE(Rp,const) GB_MDECL (Rp, const, u) - 
#define GB_Rh_DECLARE(Rh,const) GB_MDECL (Rh, const, u) - #define GB_Ri_DECLARE(Ri,const) GB_MDECL (Ri, const, ) - #define GB_Ri_DECLARE_U(Ri,const) GB_MDECL (Ri, const, u) + #define GB_Rp_DECLARE(Rp,const) GB_MDECL (Rp, const, u) + #define GB_Rh_DECLARE(Rh,const) GB_MDECL (Rh, const, u) + #define GB_Ri_DECLARE(Ri,const) GB_MDECL (Ri, const, ) + #define GB_Ri_DECLARE_U(Ri,const) GB_MDECL (Ri, const, u) // Z matrix: - #define GB_Zp_DECLARE(Zp,const) GB_MDECL (Zp, const, u) - #define GB_Zh_DECLARE(Zh,const) GB_MDECL (Zh, const, u) - #define GB_Zi_DECLARE(Zi,const) GB_MDECL (Zi, const, ) - #define GB_Zi_DECLARE_U(Zi,const) GB_MDECL (Zi, const, u) + #define GB_Zp_DECLARE(Zp,const) GB_MDECL (Zp, const, u) + #define GB_Zh_DECLARE(Zh,const) GB_MDECL (Zh, const, u) + #define GB_Zi_DECLARE(Zi,const) GB_MDECL (Zi, const, ) + #define GB_Zi_DECLARE_U(Zi,const) GB_MDECL (Zi, const, u) // for getting pointers from specific matrices: // C matrix: - #define GB_Cp_PTR(Cp,C) GB_GET_MATRIX_PTR (Cp, C, p_is_32, p) - #define GB_Ch_PTR(Ch,C) GB_GET_MATRIX_PTR (Ch, C, j_is_32, h) - #define GB_Ci_PTR(Ci,C) GB_GET_MATRIX_PTR (Ci, C, i_is_32, i) - #define GB_CYp_PTR(C_Yp,C) GB_GET_HYPER_PTR (C_Yp, C, p) - #define GB_CYi_PTR(C_Yi,C) GB_GET_HYPER_PTR (C_Yi, C, i) - #define GB_CYx_PTR(C_Yx,C) GB_GET_HYPER_PTR (C_Yx, C, x) + #define GB_Cp_PTR(Cp,C) GB_GET_MATRIX_PTR (Cp, C, p_is_32, p) + #define GB_Ch_PTR(Ch,C) GB_GET_MATRIX_PTR (Ch, C, j_is_32, h) + #define GB_Ci_PTR(Ci,C) GB_GET_MATRIX_PTR (Ci, C, i_is_32, i) #define GB_CPendingi_PTR(Pending_i,C) GB_GET_PENDINGi_PTR (Pending_i, C) #define GB_CPendingj_PTR(Pending_j,C) GB_GET_PENDINGj_PTR (Pending_j, C) // M matrix: - #define GB_Mp_PTR(Mp,M) GB_GET_MATRIX_PTR (Mp, M, p_is_32, p) - #define GB_Mh_PTR(Mh,M) GB_GET_MATRIX_PTR (Mh, M, j_is_32, h) - #define GB_Mi_PTR(Mi,M) GB_GET_MATRIX_PTR (Mi, M, i_is_32, i) - #define GB_MYp_PTR(M_Yp,M) GB_GET_HYPER_PTR (M_Yp, M, p) - #define GB_MYi_PTR(M_Yi,M) GB_GET_HYPER_PTR (M_Yi, M, i) - #define 
GB_MYx_PTR(M_Yx,M) GB_GET_HYPER_PTR (M_Yx, M, x) + #define GB_Mp_PTR(Mp,M) GB_GET_MATRIX_PTR (Mp, M, p_is_32, p) + #define GB_Mh_PTR(Mh,M) GB_GET_MATRIX_PTR (Mh, M, j_is_32, h) + #define GB_Mi_PTR(Mi,M) GB_GET_MATRIX_PTR (Mi, M, i_is_32, i) // A matrix: - #define GB_Ap_PTR(Ap,A) GB_GET_MATRIX_PTR (Ap, A, p_is_32, p) - #define GB_Ah_PTR(Ah,A) GB_GET_MATRIX_PTR (Ah, A, j_is_32, h) - #define GB_Ai_PTR(Ai,A) GB_GET_MATRIX_PTR (Ai, A, i_is_32, i) - #define GB_AYp_PTR(A_Yp,A) GB_GET_HYPER_PTR (A_Yp, A, p) - #define GB_AYi_PTR(A_Yi,A) GB_GET_HYPER_PTR (A_Yi, A, i) - #define GB_AYx_PTR(A_Yx,A) GB_GET_HYPER_PTR (A_Yx, A, x) + #define GB_Ap_PTR(Ap,A) GB_GET_MATRIX_PTR (Ap, A, p_is_32, p) + #define GB_Ah_PTR(Ah,A) GB_GET_MATRIX_PTR (Ah, A, j_is_32, h) + #define GB_Ai_PTR(Ai,A) GB_GET_MATRIX_PTR (Ai, A, i_is_32, i) // B matrix: - #define GB_Bp_PTR(Bp,B) GB_GET_MATRIX_PTR (Bp, B, p_is_32, p) - #define GB_Bh_PTR(Bh,B) GB_GET_MATRIX_PTR (Bh, B, j_is_32, h) - #define GB_Bi_PTR(Bi,B) GB_GET_MATRIX_PTR (Bi, B, i_is_32, i) - #define GB_BYp_PTR(B_Yp,B) GB_GET_HYPER_PTR (B_Yp, B, p) - #define GB_BYi_PTR(B_Yi,B) GB_GET_HYPER_PTR (B_Yi, B, i) - #define GB_BYx_PTR(B_Yx,B) GB_GET_HYPER_PTR (B_Yx, B, x) + #define GB_Bp_PTR(Bp,B) GB_GET_MATRIX_PTR (Bp, B, p_is_32, p) + #define GB_Bh_PTR(Bh,B) GB_GET_MATRIX_PTR (Bh, B, j_is_32, h) + #define GB_Bi_PTR(Bi,B) GB_GET_MATRIX_PTR (Bi, B, i_is_32, i) // S matrix: - #define GB_Sp_PTR(Sp,S) GB_GET_MATRIX_PTR (Sp, S, p_is_32, p) - #define GB_Sh_PTR(Sh,S) GB_GET_MATRIX_PTR (Sh, S, j_is_32, h) - #define GB_Si_PTR(Si,S) GB_GET_MATRIX_PTR (Si, S, i_is_32, i) - #define GB_SYp_PTR(S_Yp,S) GB_GET_HYPER_PTR (S_Yp, S, p) - #define GB_SYi_PTR(S_Yi,S) GB_GET_HYPER_PTR (S_Yi, S, i) - #define GB_SYx_PTR(S_Yx,S) GB_GET_HYPER_PTR (S_Yx, S, x) + #define GB_Sp_PTR(Sp,S) GB_GET_MATRIX_PTR (Sp, S, p_is_32, p) + #define GB_Sh_PTR(Sh,S) GB_GET_MATRIX_PTR (Sh, S, j_is_32, h) + #define GB_Si_PTR(Si,S) GB_GET_MATRIX_PTR (Si, S, i_is_32, i) // R matrix: - #define 
GB_Rp_PTR(Rp,R) GB_GET_MATRIX_PTR (Rp, R, p_is_32, p) - #define GB_Rh_PTR(Rh,R) GB_GET_MATRIX_PTR (Rh, R, j_is_32, h) - #define GB_Ri_PTR(Ri,R) GB_GET_MATRIX_PTR (Ri, R, i_is_32, i) + #define GB_Rp_PTR(Rp,R) GB_GET_MATRIX_PTR (Rp, R, p_is_32, p) + #define GB_Rh_PTR(Rh,R) GB_GET_MATRIX_PTR (Rh, R, j_is_32, h) + #define GB_Ri_PTR(Ri,R) GB_GET_MATRIX_PTR (Ri, R, i_is_32, i) // Z matrix: - #define GB_Zp_PTR(Zp,Z) GB_GET_MATRIX_PTR (Zp, Z, p_is_32, p) - #define GB_Zh_PTR(Zh,Z) GB_GET_MATRIX_PTR (Zh, Z, j_is_32, h) - #define GB_Zi_PTR(Zi,Z) GB_GET_MATRIX_PTR (Zi, Z, i_is_32, i) + #define GB_Zp_PTR(Zp,Z) GB_GET_MATRIX_PTR (Zp, Z, p_is_32, p) + #define GB_Zh_PTR(Zh,Z) GB_GET_MATRIX_PTR (Zh, Z, j_is_32, h) + #define GB_Zi_PTR(Zi,Z) GB_GET_MATRIX_PTR (Zi, Z, i_is_32, i) // for getting entries from Ap, Ah, Ai for specific matrices: @@ -867,13 +837,10 @@ struct GB_Matrix_opaque // content of GrB_Matrix // for declaring pointers for specific matrices: // C matrix: - #define GB_Cp_DECLARE(Cp,const) GB_JDECL (Cp, const, u, GB_Cp_BITS) - #define GB_Ch_DECLARE(Ch,const) GB_JDECL (Ch, const, u, GB_Cj_BITS) - #define GB_Ci_DECLARE(Ci,const) GB_JDECL (Ci, const, , GB_Ci_BITS) - #define GB_Ci_DECLARE_U(Ci,const) GB_JDECL (Ci, const, u, GB_Ci_BITS) - #define GB_CYp_DECLARE(C_Yp,const) GB_JDECL (C_Yp,const, u, GB_Cj_BITS) - #define GB_CYi_DECLARE(C_Yi,const) GB_JDECL (C_Yi,const, u, GB_Cj_BITS) - #define GB_CYx_DECLARE(C_Yx,const) GB_JDECL (C_Yx,const, u, GB_Cj_BITS) + #define GB_Cp_DECLARE(Cp,const) GB_JDECL (Cp, const, u, GB_Cp_BITS) + #define GB_Ch_DECLARE(Ch,const) GB_JDECL (Ch, const, u, GB_Cj_BITS) + #define GB_Ci_DECLARE(Ci,const) GB_JDECL (Ci, const, , GB_Ci_BITS) + #define GB_Ci_DECLARE_U(Ci,const) GB_JDECL (Ci, const, u, GB_Ci_BITS) #define GB_CPendingi_DECLARE(Pending_i) \ GB_JDECL (Pending_i, , u, GB_Ci_BITS) #define GB_CPendingj_DECLARE(Pending_j) \ @@ -883,67 +850,55 @@ struct GB_Matrix_opaque // content of GrB_Matrix #define GB_Ci_IS_32 (GB_Ci_BITS == 32) // M matrix: - 
#define GB_Mp_DECLARE(Mp,const) GB_JDECL (Mp, const, u, GB_Mp_BITS) - #define GB_Mh_DECLARE(Mh,const) GB_JDECL (Mh, const, u, GB_Mj_BITS) - #define GB_Mi_DECLARE(Mi,const) GB_JDECL (Mi, const, , GB_Mi_BITS) - #define GB_Mi_DECLARE_U(Mi,const) GB_JDECL (Mi, const, u, GB_Mi_BITS) - #define GB_MYp_DECLARE(M_Yp,const) GB_JDECL (M_Yp,const, u, GB_Mj_BITS) - #define GB_MYi_DECLARE(M_Yi,const) GB_JDECL (M_Yi,const, u, GB_Mj_BITS) - #define GB_MYx_DECLARE(M_Yx,const) GB_JDECL (M_Yx,const, u, GB_Mj_BITS) + #define GB_Mp_DECLARE(Mp,const) GB_JDECL (Mp, const, u, GB_Mp_BITS) + #define GB_Mh_DECLARE(Mh,const) GB_JDECL (Mh, const, u, GB_Mj_BITS) + #define GB_Mi_DECLARE(Mi,const) GB_JDECL (Mi, const, , GB_Mi_BITS) + #define GB_Mi_DECLARE_U(Mi,const) GB_JDECL (Mi, const, u, GB_Mi_BITS) #define GB_Mp_IS_32 (GB_Mp_BITS == 32) #define GB_Mj_IS_32 (GB_Mj_BITS == 32) #define GB_Mi_IS_32 (GB_Mi_BITS == 32) // A matrix: - #define GB_Ap_DECLARE(Ap,const) GB_JDECL (Ap, const, u, GB_Ap_BITS) - #define GB_Ah_DECLARE(Ah,const) GB_JDECL (Ah, const, u, GB_Aj_BITS) - #define GB_Ai_DECLARE(Ai,const) GB_JDECL (Ai, const, , GB_Ai_BITS) - #define GB_Ai_DECLARE_U(Ai,const) GB_JDECL (Ai, const, u, GB_Ai_BITS) - #define GB_AYp_DECLARE(A_Yp,const) GB_JDECL (A_Yp,const, u, GB_Aj_BITS) - #define GB_AYi_DECLARE(A_Yi,const) GB_JDECL (A_Yi,const, u, GB_Aj_BITS) - #define GB_AYx_DECLARE(A_Yx,const) GB_JDECL (A_Yx,const, u, GB_Aj_BITS) + #define GB_Ap_DECLARE(Ap,const) GB_JDECL (Ap, const, u, GB_Ap_BITS) + #define GB_Ah_DECLARE(Ah,const) GB_JDECL (Ah, const, u, GB_Aj_BITS) + #define GB_Ai_DECLARE(Ai,const) GB_JDECL (Ai, const, , GB_Ai_BITS) + #define GB_Ai_DECLARE_U(Ai,const) GB_JDECL (Ai, const, u, GB_Ai_BITS) #define GB_Ap_IS_32 (GB_Ap_BITS == 32) #define GB_Aj_IS_32 (GB_Aj_BITS == 32) #define GB_Ai_IS_32 (GB_Ai_BITS == 32) // B matrix: - #define GB_Bp_DECLARE(Bp,const) GB_JDECL (Bp, const, u, GB_Bp_BITS) - #define GB_Bh_DECLARE(Bh,const) GB_JDECL (Bh, const, u, GB_Bj_BITS) - #define GB_Bi_DECLARE(Bi,const) 
GB_JDECL (Bi, const, , GB_Bi_BITS) - #define GB_Bi_DECLARE_U(Bi,const) GB_JDECL (Bi, const, u, GB_Bi_BITS) - #define GB_BYp_DECLARE(B_Yp,const) GB_JDECL (B_Yp,const, u, GB_Bj_BITS) - #define GB_BYi_DECLARE(B_Yi,const) GB_JDECL (B_Yi,const, u, GB_Bj_BITS) - #define GB_BYx_DECLARE(B_Yx,const) GB_JDECL (B_Yx,const, u, GB_Bj_BITS) + #define GB_Bp_DECLARE(Bp,const) GB_JDECL (Bp, const, u, GB_Bp_BITS) + #define GB_Bh_DECLARE(Bh,const) GB_JDECL (Bh, const, u, GB_Bj_BITS) + #define GB_Bi_DECLARE(Bi,const) GB_JDECL (Bi, const, , GB_Bi_BITS) + #define GB_Bi_DECLARE_U(Bi,const) GB_JDECL (Bi, const, u, GB_Bi_BITS) #define GB_Bp_IS_32 (GB_Bp_BITS == 32) #define GB_Bj_IS_32 (GB_Bj_BITS == 32) #define GB_Bi_IS_32 (GB_Bi_BITS == 32) // S matrix: - #define GB_Sp_DECLARE(Sp,const) GB_JDECL (Sp, const, u, GB_Sp_BITS) - #define GB_Sh_DECLARE(Sh,const) GB_JDECL (Sh, const, u, GB_Sj_BITS) - #define GB_Si_DECLARE(Si,const) GB_JDECL (Si, const, , GB_Si_BITS) - #define GB_Si_DECLARE_U(Si,const) GB_JDECL (Si, const, u, GB_Si_BITS) - #define GB_SYp_DECLARE(S_Yp,const) GB_JDECL (S_Yp,const, u, GB_Sj_BITS) - #define GB_SYi_DECLARE(S_Yi,const) GB_JDECL (S_Yi,const, u, GB_Sj_BITS) - #define GB_SYx_DECLARE(S_Yx,const) GB_JDECL (S_Yx,const, u, GB_Sj_BITS) + #define GB_Sp_DECLARE(Sp,const) GB_JDECL (Sp, const, u, GB_Sp_BITS) + #define GB_Sh_DECLARE(Sh,const) GB_JDECL (Sh, const, u, GB_Sj_BITS) + #define GB_Si_DECLARE(Si,const) GB_JDECL (Si, const, , GB_Si_BITS) + #define GB_Si_DECLARE_U(Si,const) GB_JDECL (Si, const, u, GB_Si_BITS) #define GB_Sp_IS_32 (GB_Sp_BITS == 32) #define GB_Sj_IS_32 (GB_Sj_BITS == 32) #define GB_Si_IS_32 (GB_Si_BITS == 32) // R matrix: - #define GB_Rp_DECLARE(Rp,const) GB_JDECL (Rp, const, u, GB_Rp_BITS) - #define GB_Rh_DECLARE(Rh,const) GB_JDECL (Rh, const, u, GB_Rj_BITS) - #define GB_Ri_DECLARE(Ri,const) GB_JDECL (Ri, const, , GB_Ri_BITS) - #define GB_Ri_DECLARE_U(Ri,const) GB_JDECL (Ri, const, u, GB_Ri_BITS) + #define GB_Rp_DECLARE(Rp,const) GB_JDECL (Rp, const, u, 
GB_Rp_BITS) + #define GB_Rh_DECLARE(Rh,const) GB_JDECL (Rh, const, u, GB_Rj_BITS) + #define GB_Ri_DECLARE(Ri,const) GB_JDECL (Ri, const, , GB_Ri_BITS) + #define GB_Ri_DECLARE_U(Ri,const) GB_JDECL (Ri, const, u, GB_Ri_BITS) #define GB_Rp_IS_32 (GB_Rp_BITS == 32) #define GB_Rj_IS_32 (GB_Rj_BITS == 32) #define GB_Ri_IS_32 (GB_Ri_BITS == 32) // Z matrix: - #define GB_Zp_DECLARE(Zp,const) GB_JDECL (Zp, const, u, GB_Zp_BITS) - #define GB_Zh_DECLARE(Zh,const) GB_JDECL (Zh, const, u, GB_Zj_BITS) - #define GB_Zi_DECLARE(Zi,const) GB_JDECL (Zi, const, , GB_Zi_BITS) - #define GB_Zi_DECLARE_U(Zi,const) GB_JDECL (Zi, const, u, GB_Zi_BITS) + #define GB_Zp_DECLARE(Zp,const) GB_JDECL (Zp, const, u, GB_Zp_BITS) + #define GB_Zh_DECLARE(Zh,const) GB_JDECL (Zh, const, u, GB_Zj_BITS) + #define GB_Zi_DECLARE(Zi,const) GB_JDECL (Zi, const, , GB_Zi_BITS) + #define GB_Zi_DECLARE_U(Zi,const) GB_JDECL (Zi, const, u, GB_Zi_BITS) #define GB_Zp_IS_32 (GB_Zp_BITS == 32) #define GB_Zj_IS_32 (GB_Zj_BITS == 32) #define GB_Zi_IS_32 (GB_Zi_BITS == 32) @@ -951,60 +906,46 @@ struct GB_Matrix_opaque // content of GrB_Matrix // for getting pointers from specific matrices: // C matrix: - #define GB_Cp_PTR(Cp,C) GB_GET_MATRIX_PTR (Cp, C, p) - #define GB_Ch_PTR(Ch,C) GB_GET_MATRIX_PTR (Ch, C, h) - #define GB_Ci_PTR(Ci,C) GB_GET_MATRIX_PTR (Ci, C, i) - #define GB_CYp_PTR(C_Yp,C) GB_GET_HYPER_PTR (C_Yp, C, p) - #define GB_CYi_PTR(C_Yi,C) GB_GET_HYPER_PTR (C_Yi, C, i) - #define GB_CYx_PTR(C_Yx,C) GB_GET_HYPER_PTR (C_Yx, C, x) + #define GB_Cp_PTR(Cp,C) GB_GET_MATRIX_PTR (Cp, C, p) + #define GB_Ch_PTR(Ch,C) GB_GET_MATRIX_PTR (Ch, C, h) + #define GB_Ci_PTR(Ci,C) GB_GET_MATRIX_PTR (Ci, C, i) #define GB_CPendingi_PTR(Pending_i,C) Pending_i = C->Pending->i #define GB_CPendingj_PTR(Pending_j,C) Pending_j = C->Pending->j // M matrix: - #define GB_Mp_PTR(Mp,M) GB_GET_MATRIX_PTR (Mp, M, p) - #define GB_Mh_PTR(Mh,M) GB_GET_MATRIX_PTR (Mh, M, h) - #define GB_Mi_PTR(Mi,M) GB_GET_MATRIX_PTR (Mi, M, i) - #define 
GB_MYp_PTR(M_Yp,M) GB_GET_HYPER_PTR (M_Yp, M, p) - #define GB_MYi_PTR(M_Yi,M) GB_GET_HYPER_PTR (M_Yi, M, i) - #define GB_MYx_PTR(M_Yx,M) GB_GET_HYPER_PTR (M_Yx, M, x) + #define GB_Mp_PTR(Mp,M) GB_GET_MATRIX_PTR (Mp, M, p) + #define GB_Mh_PTR(Mh,M) GB_GET_MATRIX_PTR (Mh, M, h) + #define GB_Mi_PTR(Mi,M) GB_GET_MATRIX_PTR (Mi, M, i) // A matrix: - #define GB_Ap_PTR(Ap,A) GB_GET_MATRIX_PTR (Ap, A, p) - #define GB_Ah_PTR(Ah,A) GB_GET_MATRIX_PTR (Ah, A, h) - #define GB_Ai_PTR(Ai,A) GB_GET_MATRIX_PTR (Ai, A, i) - #define GB_AYp_PTR(A_Yp,A) GB_GET_HYPER_PTR (A_Yp, A, p) - #define GB_AYi_PTR(A_Yi,A) GB_GET_HYPER_PTR (A_Yi, A, i) - #define GB_AYx_PTR(A_Yx,A) GB_GET_HYPER_PTR (A_Yx, A, x) + #define GB_Ap_PTR(Ap,A) GB_GET_MATRIX_PTR (Ap, A, p) + #define GB_Ah_PTR(Ah,A) GB_GET_MATRIX_PTR (Ah, A, h) + #define GB_Ai_PTR(Ai,A) GB_GET_MATRIX_PTR (Ai, A, i) // B matrix: - #define GB_Bp_PTR(Bp,B) GB_GET_MATRIX_PTR (Bp, B, p) - #define GB_Bh_PTR(Bh,B) GB_GET_MATRIX_PTR (Bh, B, h) - #define GB_Bi_PTR(Bi,B) GB_GET_MATRIX_PTR (Bi, B, i) - #define GB_BYp_PTR(B_Yp,B) GB_GET_HYPER_PTR (B_Yp, B, p) - #define GB_BYi_PTR(B_Yi,B) GB_GET_HYPER_PTR (B_Yi, B, i) - #define GB_BYx_PTR(B_Yx,B) GB_GET_HYPER_PTR (B_Yx, B, x) + #define GB_Bp_PTR(Bp,B) GB_GET_MATRIX_PTR (Bp, B, p) + #define GB_Bh_PTR(Bh,B) GB_GET_MATRIX_PTR (Bh, B, h) + #define GB_Bi_PTR(Bi,B) GB_GET_MATRIX_PTR (Bi, B, i) // S matrix: - #define GB_Sp_PTR(Sp,S) GB_GET_MATRIX_PTR (Sp, S, p) - #define GB_Sh_PTR(Sh,S) GB_GET_MATRIX_PTR (Sh, S, h) - #define GB_Si_PTR(Si,S) GB_GET_MATRIX_PTR (Si, S, i) - #define GB_SYp_PTR(S_Yp,S) GB_GET_HYPER_PTR (S_Yp, S, p) - #define GB_SYi_PTR(S_Yi,S) GB_GET_HYPER_PTR (S_Yi, S, i) - #define GB_SYx_PTR(S_Yx,S) GB_GET_HYPER_PTR (S_Yx, S, x) + #define GB_Sp_PTR(Sp,S) GB_GET_MATRIX_PTR (Sp, S, p) + #define GB_Sh_PTR(Sh,S) GB_GET_MATRIX_PTR (Sh, S, h) + #define GB_Si_PTR(Si,S) GB_GET_MATRIX_PTR (Si, S, i) // R matrix: - #define GB_Rp_PTR(Rp,R) GB_GET_MATRIX_PTR (Rp, R, p) - #define GB_Rh_PTR(Rh,R) 
GB_GET_MATRIX_PTR (Rh, R, h) - #define GB_Ri_PTR(Ri,R) GB_GET_MATRIX_PTR (Ri, R, i) + #define GB_Rp_PTR(Rp,R) GB_GET_MATRIX_PTR (Rp, R, p) + #define GB_Rh_PTR(Rh,R) GB_GET_MATRIX_PTR (Rh, R, h) + #define GB_Ri_PTR(Ri,R) GB_GET_MATRIX_PTR (Ri, R, i) // Z matrix: - #define GB_Zp_PTR(Zp,Z) GB_GET_MATRIX_PTR (Zp, Z, p) - #define GB_Zh_PTR(Zh,Z) GB_GET_MATRIX_PTR (Zh, Z, h) - #define GB_Zi_PTR(Zi,Z) GB_GET_MATRIX_PTR (Zi, Z, i) + #define GB_Zp_PTR(Zp,Z) GB_GET_MATRIX_PTR (Zp, Z, p) + #define GB_Zh_PTR(Zh,Z) GB_GET_MATRIX_PTR (Zh, Z, h) + #define GB_Zi_PTR(Zi,Z) GB_GET_MATRIX_PTR (Zi, Z, i) // for getting entries from Ap, Ah, Ai for specific matrices: - // These must be #define'd in each JIT kernel, via GB_macrofy_* + // These must be #define'd in each JIT kernel, via GB_macrofy_sparsity + // and GB_macrofy_nvals. #endif diff --git a/GraphBLAS/Source/dup/GB_dup.c b/GraphBLAS/Source/dup/GB_dup.c index 912c1fef15..67ede5f634 100644 --- a/GraphBLAS/Source/dup/GB_dup.c +++ b/GraphBLAS/Source/dup/GB_dup.c @@ -12,18 +12,17 @@ // if numeric is false, C->x is allocated but not initialized. +// Pending work in A is copied into C; it is not finished. + // There is little use for the following feature, but (*Chandle) and A might be // identical, with GrB_dup (&A, A). The input matrix A will be lost, and will // result in a memory leak, unless the user application does the following // (which is valid and memory-leak free): -// B = A ; - -// GrB_dup (&A, A) ; - -// GrB_free (&A) ; - -// GrB_free (&B) ; +// B = A ; +// GrB_dup (&A, A) ; +// GrB_free (&A) ; +// GrB_free (&B) ; // A is the new copy and B is the old copy. Each should be freed when done. 
@@ -47,12 +46,6 @@ GrB_Info GB_dup // make an exact copy of a matrix ASSERT_MATRIX_OK (A, "A to duplicate", GB0) ; (*Chandle) = NULL ; - //-------------------------------------------------------------------------- - // delete any lingering zombies and assemble any pending tuples - //-------------------------------------------------------------------------- - - GB_MATRIX_WAIT (A) ; - //-------------------------------------------------------------------------- // C = A //-------------------------------------------------------------------------- diff --git a/GraphBLAS/Source/dup/GB_dup_worker.c b/GraphBLAS/Source/dup/GB_dup_worker.c index cd004af000..8fba2d4995 100644 --- a/GraphBLAS/Source/dup/GB_dup_worker.c +++ b/GraphBLAS/Source/dup/GB_dup_worker.c @@ -14,8 +14,13 @@ // If *Chandle is not NULL on input, the header is reused. It may be a static // or dynamic header, depending on C->header_size. +// The input matrix A can include any pending work (pending tuples, zombies, +// or jumbled). The pending work is copied into the output matrix C. It is +// not finished. This case is only supported if numeric is true. 
+ #include "GB.h" #include "get_set/GB_get_set.h" +#include "pending/GB_Pending.h" #define GB_FREE_ALL \ GB_FREE_MEMORY (&C_user_name, C_user_name_size) ; @@ -38,7 +43,7 @@ GrB_Info GB_dup_worker // make an exact copy of a matrix GrB_Info info ; ASSERT_MATRIX_OK (A, "A to duplicate", GB0) ; ASSERT (Chandle != NULL) ; - ASSERT (!GB_PENDING (A)) ; + ASSERT (GB_PENDING_OK (A)) ; ASSERT (GB_JUMBLED_OK (A)) ; ASSERT (GB_ZOMBIES_OK (A)) ; @@ -49,9 +54,12 @@ GrB_Info GB_dup_worker // make an exact copy of a matrix int nthreads_max = GB_Context_nthreads_max ( ) ; //-------------------------------------------------------------------------- - // get A + // get A and C //-------------------------------------------------------------------------- + GrB_Matrix C = (*Chandle) ; + bool preexisting_header = (C != NULL) ; + int64_t anz = GB_nnz_held (A) ; int64_t anvec = A->nvec ; int64_t anvals = A->nvals ; @@ -60,6 +68,7 @@ GrB_Info GB_dup_worker // make an exact copy of a matrix bool A_jumbled = A->jumbled ; int sparsity_control = A->sparsity_control ; GrB_Type atype = A->type ; + GB_Pending A_Pending = A->Pending ; //-------------------------------------------------------------------------- // copy the user_name of A, if present @@ -87,19 +96,40 @@ GrB_Info GB_dup_worker // make an exact copy of a matrix // allocate a new user header for C if (*Chandle) is NULL, or reuse the // existing static or dynamic header if (*Chandle) is not NULL. - GrB_Matrix C = (*Chandle) ; GB_OK (GB_new_bix (Chandle, // can be new or existing header numeric ? 
atype : ctype, A->vlen, A->vdim, GB_ph_malloc, A->is_csc, GB_sparsity (A), false, A->hyper_switch, A->plen, anz, true, C_iso, A->p_is_32, A->j_is_32, A->i_is_32)) ; C = (*Chandle) ; + //-------------------------------------------------------------------------- + // allocate the pending tuples, if present + //-------------------------------------------------------------------------- + + if (A_Pending != NULL && numeric) + { + // A has pending tuples; allocate space for them in C. This case is + // only supported if numeric is true. + ASSERT (C_iso == A->iso) ; + if (!GB_Pending_alloc (C, A->iso, A_Pending->type, A_Pending->op, + A_Pending->nmax)) + { + // out of memory + GB_FREE_ALL ; + GB_phybix_free (C) ; + if (!preexisting_header) + { + GB_Matrix_free (Chandle) ; + } + return (GrB_OUT_OF_MEMORY) ; + } + } + //-------------------------------------------------------------------------- // copy the contents of A into C //-------------------------------------------------------------------------- C->nvec = anvec ; -// C->nvec_nonempty = anvec_nonempty ; GB_nvec_nonempty_set (C, anvec_nonempty) ; C->nvals = anvals ; C->jumbled = A_jumbled ; // C is jumbled if A is jumbled @@ -133,7 +163,36 @@ GrB_Info GB_dup_worker // make an exact copy of a matrix GB_memcpy (C->x, A->x, (A->iso ? 1:anz) * atype->size, nthreads_max) ; } - C->magic = GB_MAGIC ; // C->p and C->h are now initialized + //-------------------------------------------------------------------------- + // copy the pending tuples + //-------------------------------------------------------------------------- + + if (A_Pending != NULL && numeric) + { + GB_Pending C_Pending = C->Pending ; + int64_t n = A_Pending->n ; + bool is_matrix = (A->vdim > 1) ; + size_t jsize = (A->j_is_32) ? sizeof (uint32_t) : sizeof (uint64_t) ; + size_t isize = (A->i_is_32) ? 
sizeof (uint32_t) : sizeof (uint64_t) ; + size_t xsize = A_Pending->size ; + GB_memcpy (C_Pending->i, A_Pending->i, n * isize, nthreads_max) ; + if (is_matrix) + { + GB_memcpy (C_Pending->j, A_Pending->j, n * jsize, nthreads_max) ; + } + if (!A->iso) + { + GB_memcpy (C_Pending->x, A_Pending->x, n * xsize, nthreads_max) ; + } + C_Pending->n = n ; + C_Pending->sorted = A_Pending->sorted ; + } + + //-------------------------------------------------------------------------- + // C->p and C->h are now initialized + //-------------------------------------------------------------------------- + + C->magic = GB_MAGIC ; //-------------------------------------------------------------------------- // copy the user_name of A into C, if present @@ -141,6 +200,8 @@ GrB_Info GB_dup_worker // make an exact copy of a matrix C->user_name = C_user_name ; C->user_name_size = C_user_name_size ; + C_user_name = NULL ; + C_user_name_size = 0 ; //-------------------------------------------------------------------------- // return the result diff --git a/GraphBLAS/Source/extract/GB_I_inverse.c b/GraphBLAS/Source/extract/GB_I_inverse.c index 6a54f62732..107bce68ae 100644 --- a/GraphBLAS/Source/extract/GB_I_inverse.c +++ b/GraphBLAS/Source/extract/GB_I_inverse.c @@ -1,5 +1,5 @@ //------------------------------------------------------------------------------ -// GB_I_inverse: invert an index list +// GB_I_inverse: invert an index list, by constructing R = inverse (I) //------------------------------------------------------------------------------ // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2025, All Rights Reserved. @@ -8,16 +8,25 @@ //------------------------------------------------------------------------------ // I is a large list relative to the vector length, avlen, and it is not -// contiguous. Scatter I into the I inverse buckets (Ihead and Inext) for quick -// lookup. +// contiguous. Construct the matrix R to hold the inverse of the I list for +// quick lookup. 
If i = I [k1], i = I [k2], and i = I [k3], then row i of the +// R matrix holds entries in columns k1, k2, and k3. R is iso-valued and held +// by row. If R has enough entries, it is converted to sparse. Otherwise, the +// hyper_hash R->Y is constructed to enable fast lookup of R(i,:). + +#define GB_FREE_WORKSPACE \ +{ \ + GB_FREE_MEMORY (&W, W_size) ; \ +} -// FUTURE:: this code is sequential. Constructing the I inverse buckets in -// parallel would require synchronization (a critical section for each bucket, -// or atomics). A more parallel approach might use qsort first, to find -// duplicates in I, and then construct the buckets in parallel after the qsort. -// But the time complexity would be higher. +#define GB_FREE_ALL \ +{ \ + GB_FREE_WORKSPACE ; \ + GB_Matrix_free (&R) ; \ +} #include "extract/GB_subref.h" +#include "builder/GB_build.h" GrB_Info GB_I_inverse // invert the I list for C=A(I,:) ( @@ -26,12 +35,7 @@ GrB_Info GB_I_inverse // invert the I list for C=A(I,:) int64_t nI, // length of I int64_t avlen, // length of the vectors of A // outputs: - void **p_Ihead, // head pointers for buckets, size avlen - size_t *p_Ihead_size, - void **p_Inext, // next pointers for buckets, size nI - size_t *p_Inext_size, - bool *p_Ihead_is_32, // if true, Ihead and Inext are 32-bit; else 64 - int64_t *p_nduplicates, // number of duplicate entries in I + GrB_Matrix *R_handle, // R = inverse (I) GB_Werk Werk ) { @@ -40,104 +44,178 @@ GrB_Info GB_I_inverse // invert the I list for C=A(I,:) // get inputs //-------------------------------------------------------------------------- - GB_MDECL (Ihead, , u) ; size_t Ihead_size = 0 ; - GB_MDECL (Inext, , u) ; size_t Inext_size = 0 ; - int64_t nduplicates = 0 ; - - (*p_Ihead) = NULL ; (*p_Ihead_size) = 0 ; - (*p_Inext) = NULL ; (*p_Inext_size) = 0 ; - (*p_nduplicates) = 0 ; - + GrB_Info info = GrB_SUCCESS ; + GrB_Matrix R = NULL ; + GB_MDECL (W, , u) ; size_t W_size = 0 ; + (*R_handle) = NULL ; GB_IDECL (I, const, u) ; GB_IPTR 
(I, I_is_32) ; //-------------------------------------------------------------------------- - // allocate workspace + // construct R matrix to hold the inverse of I //-------------------------------------------------------------------------- - // Entries in Ihead and Inext range in value from 0 to nI. Entries equal - // to nI or larger are invalid indices, need to tag the end of each bucket. - // Thus - - bool Ihead_is_32 = (nI < UINT32_MAX) ; - size_t isize = (Ihead_is_32) ? sizeof (uint32_t) : sizeof (uint64_t) ; - - Ihead = GB_MALLOC_MEMORY (avlen, isize, &Ihead_size) ; - Inext = GB_MALLOC_MEMORY (nI, isize, &Inext_size) ; - if (Inext == NULL || Ihead == NULL) + int64_t rvdim = avlen ; + int64_t rvlen = nI ; + int64_t rnvals = nI ; + bool Rp_is_32, Rj_is_32, Ri_is_32 ; + GB_determine_pji_is_32 (&Rp_is_32, &Rj_is_32, &Ri_is_32, + GxB_HYPERSPARSE, rnvals, rvlen, rvdim, Werk) ; + + bool W_is_32 = (nI < INT32_MAX) ; + size_t wsize = (W_is_32) ? sizeof (uint32_t) : sizeof (uint64_t) ; + W = GB_MALLOC_MEMORY (nI, wsize, &W_size) ; + if (W == NULL) { // out of memory - GB_FREE_MEMORY (&Ihead, Ihead_size) ; - GB_FREE_MEMORY (&Inext, Inext_size) ; + GB_FREE_ALL ; return (GrB_OUT_OF_MEMORY) ; } - // set all entries of Ihead to UINT*_MAX (32-bit or 64-bit) - int nthreads_max = GB_Context_nthreads_max ( ) ; - GB_memset (Ihead, 0xFF, Ihead_size, nthreads_max) ; + GB_IPTR (W, W_is_32) ; + for (int64_t k = 0 ; k < nI ; k++) + { + // W [k] = k + GB_ISET (W, k, k) ; + } - GB_IPTR (Ihead, Ihead_is_32) ; - GB_IPTR (Inext, Ihead_is_32) ; + // create R: rvdim-by-rvlen (avlen-by-nI), held by row, iso-valued + GB_OK (GB_new (&R, // new dynamic header, do not allocate content + GrB_UINT64, rvlen, rvdim, GB_ph_null, false, GxB_HYPERSPARSE, -1, 0, + Rp_is_32, Rj_is_32, Ri_is_32)) ; + + uint64_t S_input [1] ; + S_input [0] = 1 ; + + void *no_I_work = NULL ; size_t I_work_size = 0 ; + void *no_J_work = NULL ; size_t J_work_size = 0 ; + GB_void *no_X_work = NULL ; size_t X_work_size = 0 ; + 
+ GB_OK (GB_builder ( + // T + R, // matrix to build, R of size rvdim-by-rvlen + // ttype + GrB_UINT64, // type of R (iso-valued) + // vlen + rvlen, // length of each vector of R (= nI) + // vdim + rvdim, // number of vectors of R (= avlen) + // is_csc + false, // R is CSR + // I_work_handle and size + &no_I_work, &I_work_size, // I_work not used + // J_work_handle and size + &no_J_work, &J_work_size, // J_work not used + // X_work_handle and size + &no_X_work, &X_work_size, // X_work not used + // known_sorted + false, // tuples might not be sorted + // known_no_duplicates + true, // no duplicates are present (W is unique) + // isjlen + nI, // size of I and W arrays + // is_matrix + true, // R is a matrix + // I_input + W, // column indices are W [0..nI-1] = (0:nI-1) + // J_input + I, // row indices are in I [0..nI-1] + // S_input + S_input, // values of R (iso-valued) + // S_iso + true, // R is iso-valued + // nvals + rnvals, // # of tuples in R (= nI) + // dup operator + NULL, // no dup operator + // stype + GrB_UINT64, // type of S_input + // do_burble + true, // allow burble + Werk, + W_is_32, // true if W is 32-bit, false if 64 + I_is_32, // true if I is 32-bit, false if 64 + Rp_is_32, // true if R->p is built as 32-bit, false if 64 + Rj_is_32, // true if R->h is built as 32-bit, false if 64 + Ri_is_32 // true if R->i is built as 32-bit, false if 64 + )) ; + + // R is hypersparse; convert to sparse if possible + ASSERT (GB_IS_HYPERSPARSE (R)) ; + // if needed, the # of duplicates in I is (nI - R->nvec) + if (rvdim < 32 * R->nvec) + { + // R is rvdim-by-rvlen in hypersparse CSR format. Determine if it + // should be held in a sparse format instead of hypersparse. R takes + // O(rnvals) memory as hypersparse and O(rnvals+rvdim) as sparse. + // Switch R to sparse format if rvdim is small enough. 
+ GB_OK (GB_convert_hyper_to_sparse (R, true)) ; + } + else + { + // Keep R as hypersparse, but build its R->Y hyper_hash matrix + GB_OK (GB_hyper_hash_build (R, Werk)) ; + } //-------------------------------------------------------------------------- - // scatter the I indices into buckets + // check result //-------------------------------------------------------------------------- - // At this point, Ihead [0..avlen-1] >= nI = UINT64_MAX. - - // O(nI) time; not parallel - for (int64_t inew = nI-1 ; inew >= 0 ; inew--) - { - int64_t i = GB_IGET (I, inew) ; - ASSERT (i >= 0 && i < avlen) ; - int64_t ihead = GB_IGET (Ihead, i) ; - if (ihead < nI) - { - // i has already been seen in the list I - nduplicates++ ; - } - GB_ISET (Ihead, i, inew) ; // Ihead [i] = inew ; - GB_ISET (Inext, inew, ihead) ; // Inext [inew] = ihead ; - } - - // indices in I are now in buckets. An index i might appear more than once - // in the list I. inew = Ihead [i] is the first position of i in I (i will - // be I [inew]), Ihead [i] is the head of a link list of all places where i - // appears in I. inew = Inext [inew] traverses this list, until inew is >= - // nI, which denotes the end of the bucket. - - // to traverse all entries in bucket i, do: - // GB_for_each_index_in_bucket (inew,i,nI,Ihead,Inext) { ... } - - #define GB_for_each_index_in_bucket(inew,i,nI,Ihead,Inext) \ - for (uint64_t inew = GB_IGET (Ihead, i) ; \ - inew < nI ; \ - inew = GB_IGET (Inext, inew)) - - // If Ihead [i] > nI, then the ith bucket is empty and i is not in I. - // Otherise, the first index in bucket i is Ihead [i]. 
- #ifdef GB_DEBUG - for (int64_t i = 0 ; i < avlen ; i++) { - GB_for_each_index_in_bucket (inew, i, nI, Ihead, Inext) + // this test can take a very long time if A is hypersparse and + // avlen is huge + bool R_is_hyper = GB_IS_HYPERSPARSE (R) ; + int64_t rnvec = R->nvec ; + void *Rp = R->p ; + void *Rh = R->h ; + void *Ri = R->i ; + GB_IDECL (Rp, const, u) ; GB_IPTR (Rp, Rp_is_32) ; + GB_IDECL (Rh, const, u) ; GB_IPTR (Rh, Rj_is_32) ; + GB_IDECL (Ri, const, u) ; GB_IPTR (Ri, Ri_is_32) ; + GrB_Matrix R_Y = R->Y ; + void *R_Yp = R_Y ? R_Y->p : NULL ; + void *R_Yi = R_Y ? R_Y->i : NULL ; + void *R_Yx = R_Y ? R_Y->x : NULL ; + int64_t R_hash_bits = R_Y ? (R_Y->vdim - 1) : 0 ; + for (int64_t i = 0 ; i < avlen ; i++) { - // inew is the new index in C, and i is the index in A. - // All entries in the ith bucket refer to the same row A(i,:), - // but with different indices C (inew,:) in C. - ASSERT (inew >= 0 && inew < nI) ; - ASSERT (i == GB_IGET (I, inew)) ; + // find R(i,:), which contains one column index inew for each + // position in I where i occurs (i == I [inew]) + int64_t pR, pR_end ; + if (R_is_hyper) + { + // R(i,:) is the kth vector in the hypersparse matrix R; + // find k so that i = Rh [k] using the R->Y hyper_hash, + // and set pR = Rp [k] and pR_end = Rp [k+1]. 
+ GB_hyper_hash_lookup (Rp_is_32, Rj_is_32, + Rh, rnvec, Rp, R_Yp, R_Yi, R_Yx, R_hash_bits, + i, &pR, &pR_end) ; + } + else + { + // R(i,:) is the ith vector in the sparse matrix R + pR = GB_IGET (Rp, i) ; // pR = Rp [i] + pR_end = GB_IGET (Rp, i+1) ; // pR_end = Rp [i+1] + } + // for each entry in the row R(i,:) + for (int64_t p = pR ; p < pR_end ; p++) + { + // get R(i,inew); this is the index i = I [inew] + int64_t inew = GB_IGET (Ri, p) ; // inew = Ri [p] + ASSERT (inew >= 0 && inew < nI) ; + ASSERT (i == GB_IGET (I, inew)) ; + } } } #endif //-------------------------------------------------------------------------- - // return result + // free workspace and return result //-------------------------------------------------------------------------- - (*p_Ihead) = Ihead ; (*p_Ihead_size) = Ihead_size ; - (*p_Inext) = Inext ; (*p_Inext_size) = Inext_size ; - (*p_Ihead_is_32) = Ihead_is_32 ; - (*p_nduplicates) = nduplicates ; + GB_FREE_WORKSPACE ; + ASSERT_MATRIX_OK (R, "R = I_inverse matrix", GB2) ; + (*R_handle) = R ; return (GrB_SUCCESS) ; } diff --git a/GraphBLAS/Source/extract/GB_subref.c b/GraphBLAS/Source/extract/GB_subref.c index cb7e4f4e89..efcef41a1c 100644 --- a/GraphBLAS/Source/extract/GB_subref.c +++ b/GraphBLAS/Source/extract/GB_subref.c @@ -72,20 +72,19 @@ #define GB_FREE_WORKSPACE \ { \ - GB_FREE_MEMORY (&TaskList, TaskList_size) ; \ - GB_FREE_MEMORY (&Ap_start, Ap_start_size) ; \ - GB_FREE_MEMORY (&Ap_end, Ap_end_size) ; \ - GB_FREE_MEMORY (&Ihead, Ihead_size) ; \ - GB_FREE_MEMORY (&Inext, Inext_size) ; \ - GB_FREE_MEMORY (&Cwork, Cwork_size) ; \ + GB_FREE_MEMORY (&TaskList, TaskList_size) ; \ + GB_FREE_MEMORY (&Ap_start, Ap_start_size) ; \ + GB_FREE_MEMORY (&Ap_end, Ap_end_size) ; \ + GB_FREE_MEMORY (&Cwork, Cwork_size) ; \ + GB_Matrix_free (&R) ; \ } -#define GB_FREE_ALL \ -{ \ - GB_FREE_MEMORY (&Cp, Cp_size) ; \ - GB_FREE_MEMORY (&Ch, Ch_size) ; \ - GB_phybix_free (C) ; \ - GB_FREE_WORKSPACE ; \ +#define GB_FREE_ALL \ +{ \ + GB_FREE_MEMORY (&Cp, 
Cp_size) ; \ + GB_FREE_MEMORY (&Ch, Ch_size) ; \ + GB_phybix_free (C) ; \ + GB_FREE_WORKSPACE ; \ } #include "extract/GB_subref.h" @@ -185,13 +184,12 @@ GrB_Info GB_subref // C = A(I,J): either symbolic or numeric void *Ch = NULL ; size_t Ch_size = 0 ; void *Ap_start = NULL ; size_t Ap_start_size = 0 ; void *Ap_end = NULL ; size_t Ap_end_size = 0 ; - void *Ihead = NULL ; size_t Ihead_size = 0 ; - void *Inext = NULL ; size_t Inext_size = 0 ; uint64_t *Cwork = NULL ; size_t Cwork_size = 0 ; GB_task_struct *TaskList = NULL ; size_t TaskList_size = 0 ; - int64_t Cnvec = 0, nI = 0, nJ, Icolon [3], Cnvec_nonempty, ndupl ; - bool post_sort, need_qsort, Cp_is_32, Cj_is_32, Ci_is_32, Ihead_is_32 ; + int64_t Cnvec = 0, nI = 0, nJ, Icolon [3], Cnvec_nonempty ; + bool post_sort, need_qsort, Cp_is_32, Cj_is_32, Ci_is_32 ; int Ikind, ntasks, nthreads ; + GrB_Matrix R = NULL ; //-------------------------------------------------------------------------- // ensure A is unjumbled @@ -224,8 +222,7 @@ GrB_Info GB_subref // C = A(I,J): either symbolic or numeric GB_OK (GB_subref_slice ( // computed by phase1: &TaskList, &TaskList_size, &ntasks, &nthreads, &post_sort, - &Ihead, &Ihead_size, &Inext, &Inext_size, &Ihead_is_32, - &ndupl, &Cwork, &Cwork_size, + &R, &Cwork, &Cwork_size, // computed by phase0: Ap_start, Ap_end, Cnvec, need_qsort, Ikind, nI, Icolon, // original input: @@ -239,8 +236,7 @@ GrB_Info GB_subref // C = A(I,J): either symbolic or numeric // computed by phase2: &Cp, &Cp_is_32, &Cp_size, &Cnvec_nonempty, // computed by phase1: - TaskList, ntasks, nthreads, Ihead, Inext, Ihead_is_32, - ndupl > 0, &Cwork, Cwork_size, + TaskList, ntasks, nthreads, R, &Cwork, Cwork_size, // computed by phase0: Ap_start, Ap_end, Cnvec, need_qsort, Ikind, nI, Icolon, nJ, // original input: @@ -256,7 +252,7 @@ GrB_Info GB_subref // C = A(I,J): either symbolic or numeric // from phase2: &Cp, Cp_is_32, Cp_size, Cnvec_nonempty, // from phase1: - TaskList, ntasks, nthreads, post_sort, Ihead, 
Inext, Ihead_is_32, ndupl, + TaskList, ntasks, nthreads, post_sort, R, // from phase0: &Ch, Cj_is_32, Ci_is_32, Ch_size, Ap_start, Ap_end, Cnvec, need_qsort, Ikind, nI, Icolon, nJ, diff --git a/GraphBLAS/Source/extract/GB_subref.h b/GraphBLAS/Source/extract/GB_subref.h index ff12b55633..5f990a262d 100644 --- a/GraphBLAS/Source/extract/GB_subref.h +++ b/GraphBLAS/Source/extract/GB_subref.h @@ -65,12 +65,7 @@ GrB_Info GB_I_inverse // invert the I list for C=A(I,:) int64_t nI, // length of I int64_t avlen, // length of the vectors of A // outputs: - void **p_Ihead, // head pointers for buckets, size avlen - size_t *p_Ihead_size, - void **p_Inext, // next pointers for buckets, size nI - size_t *p_Inext_size, - bool *p_Ihead_is_32, // if true, Ihead and Inext are 32-bit; else 64 - int64_t *p_nduplicates, // number of duplicate entries in I + GrB_Matrix *R_handle, // R = inverse (I) GB_Werk Werk ) ; @@ -82,12 +77,7 @@ GrB_Info GB_subref_slice // phase 1 of GB_subref int *p_ntasks, // # of tasks constructed int *p_nthreads, // # of threads for subref operation bool *p_post_sort, // true if a final post-sort is needed - void **p_Ihead, // for I inverse, if needed; size avlen - size_t *p_Ihead_size, - void **p_Inext, // for I inverse, if needed; size nI - size_t *p_Inext_size, - bool *p_Ihead_is_32, // if true, Ihead and Inext are 32-bit; else 64 - int64_t *p_nduplicates, // # of duplicates, if I inverse computed + GrB_Matrix *R_handle, // R = inverse (I), if needed uint64_t **p_Cwork, // workspace of size max(2,C->nvec+1) size_t *p_Cwork_size, // from phase0: @@ -118,10 +108,7 @@ GrB_Info GB_subref_phase2 // count nnz in each C(:,j) GB_task_struct *restrict TaskList, // array of structs const int ntasks, // # of tasks const int nthreads, // # of threads to use - const void *Ihead, // for I inverse buckets, size A->vlen - const void *Inext, // for I inverse buckets, size nI - const bool Ihead_is_32, // if true, Ihead,Inext 32-bit; else 64 - const bool I_has_duplicates, // 
true if I has duplicates + const GrB_Matrix R, // R = inverse (I), if needed uint64_t **p_Cwork, // workspace of size max(2,C->nvec+1) size_t Cwork_size, // analysis from phase0: @@ -154,10 +141,7 @@ GrB_Info GB_subref_phase3 // C=A(I,J) const int ntasks, // # of tasks const int nthreads, // # of threads to use const bool post_sort, // true if post-sort needed - const void *Ihead, // for I inverse buckets, size A->vlen - const void *Inext, // for I inverse buckets, size nI - const bool Ihead_is_32, // if true, Ihead,Inext 32-bit; else 64 - const bool I_has_duplicates, // true if I has duplicates + const GrB_Matrix R, // R = inverse (I), if needed // from phase0: void **Ch_handle, const bool Cj_is_32, // if true, C->h is 32-bit; else 64-bit diff --git a/GraphBLAS/Source/extract/GB_subref_phase0.c b/GraphBLAS/Source/extract/GB_subref_phase0.c index 4d177affee..6881c3f38b 100644 --- a/GraphBLAS/Source/extract/GB_subref_phase0.c +++ b/GraphBLAS/Source/extract/GB_subref_phase0.c @@ -190,12 +190,12 @@ static inline void GB_find_Ap_start_end GB_WERK_POP (Count, uint64_t) ; \ } -#define GB_FREE_ALL \ -{ \ - GB_FREE_WORKSPACE ; \ - GB_FREE_MEMORY (&Ch, Ch_size) ; \ - GB_FREE_MEMORY (&Ap_start, Ap_start_size) ; \ - GB_FREE_MEMORY (&Ap_end, Ap_end_size) ; \ +#define GB_FREE_ALL \ +{ \ + GB_FREE_WORKSPACE ; \ + GB_FREE_MEMORY (&Ch, Ch_size) ; \ + GB_FREE_MEMORY (&Ap_start, Ap_start_size) ; \ + GB_FREE_MEMORY (&Ap_end, Ap_end_size) ; \ } GrB_Info GB_subref_phase0 diff --git a/GraphBLAS/Source/extract/GB_subref_phase2.c b/GraphBLAS/Source/extract/GB_subref_phase2.c index 818522ff7e..c2260004f6 100644 --- a/GraphBLAS/Source/extract/GB_subref_phase2.c +++ b/GraphBLAS/Source/extract/GB_subref_phase2.c @@ -26,10 +26,7 @@ GrB_Info GB_subref_phase2 // count nnz in each C(:,j) GB_task_struct *restrict TaskList, // array of structs const int ntasks, // # of tasks const int nthreads, // # of threads to use - const void *Ihead, // for I inverse buckets, size A->vlen - const void *Inext, 
// for I inverse buckets, size nI - const bool Ihead_is_32, // if true, Ihead,Inext 32-bit; else 64 - const bool I_has_duplicates, // true if I has duplicates + const GrB_Matrix R, // R = inverse (I), if needed uint64_t **p_Cwork, // workspace of size max(2,C->nvec+1) size_t Cwork_size, // analysis from phase0: @@ -62,8 +59,39 @@ GrB_Info GB_subref_phase2 // count nnz in each C(:,j) GB_IDECL (I , const, u) ; GB_IPTR (I , I_is_32) ; GB_IDECL (Ap_start, const, u) ; GB_IPTR (Ap_start, A->p_is_32) ; GB_IDECL (Ap_end , const, u) ; GB_IPTR (Ap_end , A->p_is_32) ; - GB_IDECL (Ihead , const, u) ; GB_IPTR (Ihead , Ihead_is_32) ; - GB_IDECL (Inext , const, u) ; GB_IPTR (Inext , Ihead_is_32) ; + + bool R_is_hyper = false ; + int64_t rnvec = 0, R_hash_bits = 0 ; + void *Rp = NULL, *Rh = NULL, *Ri = NULL ; + void *R_Yp = NULL, *R_Yi = NULL, *R_Yx = NULL ; + bool Rp_is_32 = false ; + bool Rj_is_32 = false ; + bool Ri_is_32 = false ; + GB_IDECL (Rp, const, u) ; + GB_IDECL (Rh, const, u) ; + GB_IDECL (Ri, const, u) ; + if (R != NULL) + { + R_is_hyper = GB_IS_HYPERSPARSE (R) ; + rnvec = R->nvec ; + Rp = R->p ; + Rh = R->h ; + Ri = R->i ; + GB_IPTR (Rp, R->p_is_32) ; + GB_IPTR (Rh, R->j_is_32) ; + GB_IPTR (Ri, R->i_is_32) ; + Rp_is_32 = R->p_is_32 ; + Rj_is_32 = R->j_is_32 ; + Ri_is_32 = R->i_is_32 ; + GrB_Matrix R_Y = R->Y ; + if (R_Y != NULL) + { + R_Yp = R_Y->p ; + R_Yi = R_Y->i ; + R_Yx = R_Y->x ; + R_hash_bits = (R_Y->vdim - 1) ; + } + } (*Cp_handle) = NULL ; (*Cp_size_handle) = 0 ; @@ -97,7 +125,6 @@ GrB_Info GB_subref_phase2 // count nnz in each C(:,j) #define GB_I_KIND Ikind #define GB_NEED_QSORT need_qsort - #define GB_I_HAS_DUPLICATES I_has_duplicates #define GB_ANALYSIS_PHASE if (symbolic) diff --git a/GraphBLAS/Source/extract/GB_subref_phase3.c b/GraphBLAS/Source/extract/GB_subref_phase3.c index 43112d366a..4e39916b26 100644 --- a/GraphBLAS/Source/extract/GB_subref_phase3.c +++ b/GraphBLAS/Source/extract/GB_subref_phase3.c @@ -27,10 +27,7 @@ GrB_Info GB_subref_phase3 // 
C=A(I,J) const int ntasks, // # of tasks const int nthreads, // # of threads to use const bool post_sort, // true if post-sort needed - const void *Ihead, // for I inverse buckets, size A->vlen - const void *Inext, // for I inverse buckets, size nI - const bool Ihead_is_32, // if true, Ihead,Inext 32-bit; else 64 - const bool I_has_duplicates, // true if I has duplicates + const GrB_Matrix R, // R = inverse (I), if needed // from phase0: void **Ch_handle, const bool Cj_is_32, // if true, C->h is 32-bit; else 64-bit @@ -78,8 +75,39 @@ GrB_Info GB_subref_phase3 // C=A(I,J) GB_IDECL (I , const, u) ; GB_IPTR (I , I_is_32) ; GB_IDECL (Ap_start, const, u) ; GB_IPTR (Ap_start, Ap_is_32) ; GB_IDECL (Ap_end , const, u) ; GB_IPTR (Ap_end , Ap_is_32) ; - GB_IDECL (Ihead , const, u) ; GB_IPTR (Ihead , Ihead_is_32) ; - GB_IDECL (Inext , const, u) ; GB_IPTR (Inext , Ihead_is_32) ; + + bool R_is_hyper = false ; + int64_t rnvec = 0, R_hash_bits = 0 ; + void *Rp = NULL, *Rh = NULL, *Ri = NULL ; + void *R_Yp = NULL, *R_Yi = NULL, *R_Yx = NULL ; + bool Rp_is_32 = false ; + bool Rj_is_32 = false ; + bool Ri_is_32 = false ; + GB_IDECL (Rp, const, u) ; + GB_IDECL (Rh, const, u) ; + GB_IDECL (Ri, const, u) ; + if (R != NULL) + { + R_is_hyper = GB_IS_HYPERSPARSE (R) ; + rnvec = R->nvec ; + Rp = R->p ; + Rh = R->h ; + Ri = R->i ; + GB_IPTR (Rp, R->p_is_32) ; + GB_IPTR (Rh, R->j_is_32) ; + GB_IPTR (Ri, R->i_is_32) ; + Rp_is_32 = R->p_is_32 ; + Rj_is_32 = R->j_is_32 ; + Ri_is_32 = R->i_is_32 ; + GrB_Matrix R_Y = R->Y ; + if (R_Y != NULL) + { + R_Yp = R_Y->p ; + R_Yi = R_Y->i ; + R_Yx = R_Y->x ; + R_hash_bits = (R_Y->vdim - 1) ; + } + } ASSERT (Cp != NULL) ; ASSERT_MATRIX_OK (A, "A for subref phase3", GB0) ; @@ -141,7 +169,6 @@ GrB_Info GB_subref_phase3 // C=A(I,J) #define GB_PHASE_2_OF_2 #define GB_I_KIND Ikind #define GB_NEED_QSORT need_qsort - #define GB_I_HAS_DUPLICATES I_has_duplicates if (symbolic) { @@ -244,8 +271,7 @@ GrB_Info GB_subref_phase3 // C=A(I,J) // using the JIT kernel info 
= GB_subref_sparse_jit (C, TaskList, ntasks, nthreads, post_sort, - Ihead, Inext, Ihead_is_32, I_has_duplicates, Ap_start, Ap_end, - need_qsort, Ikind, nI, Icolon, A, I, I_is_32) ; + R, Ap_start, Ap_end, need_qsort, Ikind, nI, Icolon, A, I, I_is_32) ; if (info == GrB_NO_VALUE) { diff --git a/GraphBLAS/Source/extract/GB_subref_slice.c b/GraphBLAS/Source/extract/GB_subref_slice.c index d3d8073b8d..7ba0ba6d03 100644 --- a/GraphBLAS/Source/extract/GB_subref_slice.c +++ b/GraphBLAS/Source/extract/GB_subref_slice.c @@ -40,10 +40,9 @@ #define GB_FREE_ALL \ { \ GB_FREE_WORKSPACE ; \ - GB_FREE_MEMORY (&Cwork, Cwork_size) ; \ - GB_FREE_MEMORY (&TaskList, TaskList_size) ; \ - GB_FREE_MEMORY (&Ihead, Ihead_size) ; \ - GB_FREE_MEMORY (&Inext, Inext_size) ; \ + GB_FREE_MEMORY (&Cwork, Cwork_size) ; \ + GB_FREE_MEMORY (&TaskList, TaskList_size) ; \ + GB_Matrix_free (&R) ; \ } #define GB_RETURN_RESULTS \ @@ -53,12 +52,7 @@ (*p_ntasks ) = ntasks ; \ (*p_nthreads ) = nthreads ; \ (*p_post_sort ) = post_sort ; \ - (*p_Ihead ) = Ihead ; \ - (*p_Ihead_size ) = Ihead_size ; \ - (*p_Inext ) = Inext ; \ - (*p_Inext_size ) = Inext_size ; \ - (*p_Ihead_is_32 ) = Ihead_is_32 ; \ - (*p_nduplicates ) = nduplicates ; \ + (*R_handle ) = R ; \ (*p_Cwork ) = Cwork ; \ (*p_Cwork_size ) = Cwork_size ; \ } @@ -73,12 +67,7 @@ GrB_Info GB_subref_slice // phase 1 of GB_subref int *p_ntasks, // # of tasks constructed int *p_nthreads, // # of threads for subref operation bool *p_post_sort, // true if a final post-sort is needed - void **p_Ihead, // for I inverse, if needed; size avlen - size_t *p_Ihead_size, - void **p_Inext, // for I inverse, if needed; size nI - size_t *p_Inext_size, - bool *p_Ihead_is_32, // if true, Ihead and Inext are 32-bit; else 64 - int64_t *p_nduplicates, // # of duplicates, if I inverse computed + GrB_Matrix *R_handle, // R = inverse (I), if needed uint64_t **p_Cwork, // workspace of size max(2,C->nvec+1) size_t *p_Cwork_size, // from phase0: @@ -108,34 +97,22 @@ GrB_Info 
GB_subref_slice // phase 1 of GB_subref ASSERT (p_ntasks != NULL) ; ASSERT (p_nthreads != NULL) ; ASSERT (p_post_sort != NULL) ; - ASSERT (p_Ihead != NULL) ; - ASSERT (p_Ihead_size != NULL) ; - ASSERT (p_Inext != NULL) ; - ASSERT (p_Inext_size != NULL) ; - ASSERT (p_nduplicates != NULL) ; ASSERT (p_Cwork != NULL) ; ASSERT (p_Cwork_size != NULL) ; + ASSERT (R_handle != NULL) ; ASSERT ((Cnvec > 0) == (Ap_start != NULL)) ; ASSERT ((Cnvec > 0) == (Ap_end != NULL)) ; (*p_TaskList) = NULL ; (*p_TaskList_size) = 0 ; - (*p_Ihead) = NULL ; - (*p_Inext) = NULL ; - (*p_Ihead_is_32) = false ; (*p_Cwork) = NULL ; - (*p_Ihead_size) = 0 ; - (*p_Inext_size) = 0 ; (*p_Cwork_size) = 0 ; - (*p_nduplicates) = 0 ; - void *Ihead = NULL ; size_t Ihead_size = 0 ; - void *Inext = NULL ; size_t Inext_size = 0 ; - bool Ihead_is_32 = false ; uint64_t *restrict Cwork = NULL ; size_t Cwork_size = 0 ; GB_WERK_DECLARE (Coarse, int64_t) ; // size ntasks1+1 int ntasks1 = 0 ; + GrB_Matrix R = NULL ; GrB_Info info ; @@ -167,22 +144,9 @@ GrB_Info GB_subref_slice // phase 1 of GB_subref GB_REALLOC_TASK_WORK (TaskList, ntasks0, max_ntasks) ; //-------------------------------------------------------------------------- - // determine if I_inverse can be constructed + // to determine if R needs to be constructed //-------------------------------------------------------------------------- - // I_inverse_ok is true if I might be inverted. If false, then I will not - // be inverted. I can be inverted only if the workspace for the inverse - // does not exceed nnz(A). Note that if I was provided on input as an - // explicit list, but consists of a contiguous range imin:imax, then Ikind - // is now GB_LIST and the list I is ignored. - - // If I_inverse_ok is true, the inverse of I might still not be needed. - // need_I_inverse becomes true if any C(:,kC) = A (I,kA) computation - // requires I inverse. 
- - int64_t I_inverse_limit = GB_IMAX (4096, anz) ; - bool I_inverse_ok = (Ikind == GB_LIST && - ((nI > avlen / 256) || ((nI + avlen) < I_inverse_limit))) ; bool need_I_inverse = false ; bool post_sort = false ; int64_t iinc = Icolon [GxB_INC] ; @@ -221,7 +185,7 @@ GrB_Info GB_subref_slice // phase 1 of GB_subref bool this_needs_I_inverse ; // true if this vector needs I inverse // amount of work for C(:,kC) = A (I,kA): int64_t work = GB_subref_work (&this_needs_I_inverse, alen, avlen, - Ikind, nI, I_inverse_ok, need_qsort, iinc) ; + Ikind, nI, need_qsort, iinc) ; // log the result need_I_inverse = need_I_inverse || this_needs_I_inverse ; @@ -251,13 +215,9 @@ GrB_Info GB_subref_slice // phase 1 of GB_subref // invert I if required //-------------------------------------------------------------------------- - int64_t nduplicates = 0 ; if (need_I_inverse) { - GB_OK (GB_I_inverse (I, I_is_32, nI, avlen, &Ihead, &Ihead_size, - &Inext, &Inext_size, &Ihead_is_32, &nduplicates, Werk)) ; - ASSERT (Ihead != NULL) ; - ASSERT (Inext != NULL) ; + GB_OK (GB_I_inverse (I, I_is_32, nI, avlen, &R, Werk)) ; } //-------------------------------------------------------------------------- @@ -295,8 +255,6 @@ GrB_Info GB_subref_slice // phase 1 of GB_subref // construct all tasks, both coarse and fine //-------------------------------------------------------------------------- - bool I_has_duplicates = (nduplicates > 0) ; - for (int t = 0 ; t < ntasks1 ; t++) { @@ -404,7 +362,7 @@ GrB_Info GB_subref_slice // phase 1 of GB_subref int64_t alen = pA_end - pA ; // nnz (A (imin:imax,j)) int method = GB_subref_method (alen, avlen, Ikind, nI, - I_inverse_ok, need_qsort, iinc, I_has_duplicates) ; + need_qsort, iinc) ; if (method == 10) { diff --git a/GraphBLAS/Source/extract/include/GB_subref_method.h b/GraphBLAS/Source/extract/include/GB_subref_method.h index 9d06cf9f23..4780e5211f 100644 --- a/GraphBLAS/Source/extract/include/GB_subref_method.h +++ 
b/GraphBLAS/Source/extract/include/GB_subref_method.h @@ -24,11 +24,8 @@ static inline int GB_subref_method // return the method to use (1 to 12) const int64_t avlen, // A->vlen const int Ikind, // GB_ALL, GB_RANGE, GB_STRIDE, or GB_LIST const int64_t nI, // length of I - const bool I_inverse_ok, // true if I is invertable const bool need_qsort, // true if C(:,k) requires sorting - const int64_t iinc, // increment for GB_STRIDE - const bool I_has_duplicates // true if duplicates in I - // (false if not yet known) + const int64_t iinc // increment for GB_STRIDE ) { @@ -68,8 +65,7 @@ static inline int GB_subref_method // return the method to use (1 to 12) // Case 5: C (:,k) = A (ibegin:iend,j) method = 5 ; } - else if ((Ikind == GB_LIST && !I_inverse_ok) || // must do Case 6 - (64 * nI < ajnz)) // Case 6 faster + else if (64 * nI < ajnz) // Case 6 faster in this case { // Case 6: nI not large; binary search of A(:,j) for each i in I method = 6 ; @@ -92,9 +88,9 @@ static inline int GB_subref_method // return the method to use (1 to 12) method = 9 ; } } - else // Ikind == GB_LIST, and I inverse buckets will be used + else // Ikind == GB_LIST, and R = inverse(I) will be used { - // construct the I inverse buckets + // construct the R matrix if (need_qsort) { // Case 10: nI large, need qsort @@ -102,20 +98,10 @@ static inline int GB_subref_method // return the method to use (1 to 12) // use this method, a post sort is needed when all tasks are done. method = 10 ; } - else if (I_has_duplicates) - { - // Case 11: nI large, no qsort, with duplicates - // duplicates are possible so cjnz > ajnz can hold. Note that the - // # of duplicates is only known after I is inverted, which might - // not yet be done. In that case, nuplicates is assumed to be - // zero, and Case 12 is assumed to be used instead. This is - // revised after I is inverted. 
- method = 11 ; - } else - { - // Case 12: nI large, no qsort, no duplicates - method = 12 ; + { + // Case 11: nI large, no qsort, duplicates are OK + method = 11 ; } } @@ -139,7 +125,6 @@ static inline int64_t GB_subref_work // return the work for a subref method const int64_t avlen, // A->vlen const int Ikind, // GB_ALL, GB_RANGE, GB_STRIDE, or GB_LIST const int64_t nI, // length of I - const bool I_inverse_ok, // true if I is invertable const bool need_qsort, // true if C(:,k) requires sorting const int64_t iinc // increment for GB_STRIDE ) @@ -149,17 +134,7 @@ static inline int64_t GB_subref_work // return the work for a subref method // get the method //-------------------------------------------------------------------------- - // nduplicates in I not yet known; it is found when I is inverted. For - // now, assume I has no duplicate entries. All that is needed for now is - // the work required for each C(:,k), and whether or not I inverse must be - // created. The # of duplicates has no impact on the I inverse decision, - // and a minor effect on the work (which is ignored). Method 11 is only - // used if I_has_duplicates is true. 
- - const bool I_has_duplicates = false ; // not yet known - - int method = GB_subref_method (ajnz, avlen, Ikind, nI, I_inverse_ok, - need_qsort, iinc, I_has_duplicates) ; + int method = GB_subref_method (ajnz, avlen, Ikind, nI, need_qsort, iinc) ; //-------------------------------------------------------------------------- // get the work @@ -178,10 +153,8 @@ static inline int64_t GB_subref_work // return the work for a subref method case 8 : work = ajnz ; break ; case 9 : work = ajnz ; break ; case 10 : work = ajnz * 32 ; break ; -// case 11 : -// work = ajnz * 2 ; break ; // case not determined yet default : - case 12 : work = ajnz ; break ; + case 11 : work = ajnz * 2 ; break ; } //-------------------------------------------------------------------------- diff --git a/GraphBLAS/Source/extract/template/GB_subref_template.c b/GraphBLAS/Source/extract/template/GB_subref_template.c index 442ba67ad6..3da89d6e3b 100644 --- a/GraphBLAS/Source/extract/template/GB_subref_template.c +++ b/GraphBLAS/Source/extract/template/GB_subref_template.c @@ -17,11 +17,36 @@ // iso: C = A(I,J), extracting the pattern only, not the values // numeric: C = A(I,J), extracting the pattern and values -// to iterate across all entries in a bucket: -#define GB_for_each_index_in_bucket(inew,i,nI,Ihead,Inext) \ - for (uint64_t inew = GB_IGET (Ihead, i) ; \ - inew < nI ; \ - inew = GB_IGET (Inext, inew)) +// The matrix R holds the "inverse" of I, which is not actually an inverse +// since I can have duplicates. If i = I [k1] = I [k2] = I [k3], then the +// column indices in R(i,:) are {k1, k2, k3}. R is held by row, and is either +// sparse or hypersparse. + +#define GB_for_each_inew_in_I_inverse_hash(i,pR) \ + int64_t pR, pR_end ; \ + if (R_is_hyper) \ + { \ + /* R(i,:) is the kth vector in the hypersparse matrix R; */ \ + /* find k so that i = Rh [k] using the R->Y hyper_hash, */ \ + /* and set pR = Rp [k] and pR_end = Rp [k+1]. 
*/ \ + GB_hyper_hash_lookup (Rp_is_32, Rj_is_32, \ + Rh, rnvec, Rp, R_Yp, R_Yi, R_Yx, R_hash_bits, \ + i, &pR, &pR_end) ; \ + } \ + else \ + { \ + /* R(i,:) is the ith vector in the sparse matrix R */ \ + pR = GB_IGET (Rp, i) ; /* pR = Rp [i] */ \ + pR_end = GB_IGET (Rp, i+1) ; /* pR_end = Rp [i+1] */ \ + } \ + /* for each entry in the row R(i,:) */ \ + for ( ; pR < pR_end ; pR++) + #if 0 + { + // get R(i,inew); this is the index i = I [inew] + int64_t inew = GB_IGET (Ri, pR) ; // inew = Ri [pR] + } + #endif //------------------------------------------------------------------------------ @@ -185,7 +210,7 @@ { // determine the method based on A(*,kA) and I method = GB_subref_method (alen, avlen, GB_I_KIND, nI, - (Ihead != NULL), GB_NEED_QSORT, iinc, GB_I_HAS_DUPLICATES) ; + GB_NEED_QSORT, iinc) ; } //------------------------------------------------------------------ @@ -355,12 +380,10 @@ // properties. For a fine task, A(:,kA) has not been // sliced; I has been sliced instead. - // If the I bucket inverse has not been created, this - // method is the only option. Alternatively, if nI = - // length (I) is << nnz (A (:,kA)), then scanning I and - // doing a binary search of A (:,kA) is faster than doing a - // linear-time search of A(:,kA) and a lookup into the I - // bucket inverse. + // If nI = length (I) is << nnz (A (:,kA)), then scanning I + // and doing a binary search of A (:,kA) is faster than + // doing a linear-time scan of A(:,kA) and a lookup into + // R for each row index i in A(:,kA). // The vector of C is constructed in sorted order, so no // sort is needed. @@ -513,10 +536,8 @@ case 10 : // I unsorted, and C needs qsort, duplicates OK //-------------------------------------------------------------- - // Time: with one thread: 2x slower, probably - // because of the qsort. Good speedup however. This used - // if qsort is needed but ndupl == 0. Try a method that - // needs qsort, but no duplicates? 
+ // Time: with one thread: 2x slower, probably because of + // the qsort. Good speedup however. // Case 10 works well when I has many entries and A(:,kA) // has few entries. C(:,kC) must be sorted after this pass. @@ -524,15 +545,16 @@ ASSERT (GB_I_KIND == GB_LIST) ; for (int64_t k = 0 ; k < alen ; k++) { - // A(i,kA) present, look it up in the I inverse buckets + // A(i,kA) present, look it up in R(i,:) int64_t i = GB_IGET (Ai, pA + k) ; #if defined ( GB_SYMBOLIC ) i = GB_UNZOMBIE (i) ; #endif - // traverse bucket i for all indices inew where + // traverse R(i,:) for all indices inew where // i == I [inew] or where i is from a colon expression - GB_for_each_index_in_bucket (inew, i, nI, Ihead, Inext) + GB_for_each_inew_in_I_inverse_hash (i,pR) { + int64_t inew = GB_IGET (Ri, pR) ; // inew = Ri [pR] ASSERT (inew >= 0 && inew < nI) ; ASSERT (i == GB_IJLIST (I, inew, GB_I_KIND,Icolon)); #if defined ( GB_ANALYSIS_PHASE ) @@ -564,7 +586,7 @@ break ; //-------------------------------------------------------------- - case 11 : // I not contiguous, with duplicates. No qsort needed + case 11 : // I not contiguous, duplicates OK. 
No qsort needed //-------------------------------------------------------------- // Case 11 works well when I has many entries and A(:,kA) @@ -575,15 +597,16 @@ ASSERT (GB_I_KIND == GB_LIST) ; for (int64_t k = 0 ; k < alen ; k++) { - // A(i,kA) present, look it up in the I inverse buckets + // A(i,kA) present, look it up in R(i,:) int64_t i = GB_IGET (Ai, pA + k) ; #if defined ( GB_SYMBOLIC ) i = GB_UNZOMBIE (i) ; #endif - // traverse bucket i for all indices inew where + // traverse R(i,:) for all indices inew where // i == I [inew] or where i is from a colon expression - GB_for_each_index_in_bucket (inew, i, nI, Ihead, Inext) + GB_for_each_inew_in_I_inverse_hash (i,pR) { + int64_t inew = GB_IGET (Ri, pR) ; // inew = Ri [pR] ASSERT (inew >= 0 && inew < nI) ; ASSERT (i == GB_IJLIST (I, inew, GB_I_KIND,Icolon)); #if defined ( GB_ANALYSIS_PHASE ) @@ -601,43 +624,6 @@ #endif break ; - //-------------------------------------------------------------- - case 12 : // I not contiguous, no duplicates. No qsort needed. - //-------------------------------------------------------------- - - // Identical to Case 11, except GB_for_each_index_in_bucket - // just needs to iterate 0 or 1 times. Works well when I - // has many entries and A(:,kA) has few entries. 
- - ASSERT (GB_I_KIND == GB_LIST && !GB_I_HAS_DUPLICATES) - for (int64_t k = 0 ; k < alen ; k++) - { - // A(i,kA) present, look it up in the I inverse buckets - int64_t i = GB_IGET (Ai, pA + k) ; - #if defined ( GB_SYMBOLIC ) - i = GB_UNZOMBIE (i) ; - #endif - // bucket i has at most one index inew such that - // i == I [inew] - uint64_t inew = GB_IGET (Ihead, i) ; - if (inew < nI) - { - ASSERT (i == GB_IJLIST (I, inew, GB_I_KIND,Icolon)); - #if defined ( GB_ANALYSIS_PHASE ) - clen++ ; - #else - GB_ISET (Ci, pC, inew) ; // Ci [pC] = inew ; - GB_COPY_ENTRY (pC, pA + k) ; - pC++ ; - #endif - } - } - - #if defined ( GB_PHASE_2_OF_2 ) - ASSERT (pC == pC_end) ; - #endif - break ; - //-------------------------------------------------------------- default: ; //-------------------------------------------------------------- @@ -708,7 +694,7 @@ #endif } -#undef GB_for_each_index_in_bucket +#undef GB_for_each_inew_in_I_inverse_hash #undef GB_COPY_RANGE #undef GB_COPY_ENTRY #undef GB_SYMBOLIC diff --git a/GraphBLAS/Source/gateway/GB_cuda_gateway.h b/GraphBLAS/Source/gateway/GB_cuda_gateway.h index 06fdf642c0..ed5b640180 100644 --- a/GraphBLAS/Source/gateway/GB_cuda_gateway.h +++ b/GraphBLAS/Source/gateway/GB_cuda_gateway.h @@ -75,7 +75,7 @@ static inline int GB_ngpus_to_use else if (gpu_hack == 1) { // always use all available GPU(s) - // Fixme for CUDA: allow 1 to gpu_count to be requested + // FIXME for CUDA: allow 1 to gpu_count to be requested return (gpu_count) ; } else diff --git a/GraphBLAS/Source/global/GB_Global.c b/GraphBLAS/Source/global/GB_Global.c index c2a010509c..fc7e0a237c 100644 --- a/GraphBLAS/Source/global/GB_Global.c +++ b/GraphBLAS/Source/global/GB_Global.c @@ -253,7 +253,7 @@ static GB_Global_struct GB_Global = .gpu_count = 0, // # of GPUs in the system // OpenMP locks - .lock_is_created = {0, 0, 0, 0}, + .lock_is_created = {0, 0, 0, 0, 0, 0, 0, 0}, // of size GB_GLOBAL_NLOCKS } ; 
//============================================================================== diff --git a/GraphBLAS/Source/init/GB_init.c b/GraphBLAS/Source/init/GB_init.c index d84e51b036..742392baeb 100644 --- a/GraphBLAS/Source/init/GB_init.c +++ b/GraphBLAS/Source/init/GB_init.c @@ -88,7 +88,7 @@ GrB_Info GB_init // start up GraphBLAS bool malloc_is_thread_safe = true ; #if defined ( GRAPHBLAS_HAS_CUDA ) - mode = GxB_NONBLOCKING_GPU ; // HACK Fixme for CUDA: force GPU to be used + mode = GxB_NONBLOCKING_GPU ; // HACK FIXME for CUDA: force GPU to be used if (mode == GxB_NONBLOCKING_GPU || mode == GxB_BLOCKING_GPU) { // ignore the memory management function pointers and use rmm_wrap_* @@ -206,9 +206,9 @@ GrB_Info GB_init // start up GraphBLAS #pragma omp flush #if defined ( GRAPHBLAS_HAS_CUDA ) // this hack_get setting is used by GB_ngpus_to_use: -// GB_Global_hack_set (2,0) ; // HACK Fixme for CUDA: default: GPU for big enough probs - GB_Global_hack_set (2,1) ; // HACK Fixme for CUDA: force the GPU always to be used -// GB_Global_hack_set (2,2) ; // HACK Fixme for CUDA: force the GPU never to be used +// GB_Global_hack_set (2,0) ; // HACK FIXME for CUDA: default: GPU for big enough probs + GB_Global_hack_set (2,1) ; // HACK FIXME for CUDA: force the GPU always to be used +// GB_Global_hack_set (2,2) ; // HACK FIXME for CUDA: force the GPU never to be used #endif return (GrB_SUCCESS) ; diff --git a/GraphBLAS/Source/init/GrB_init.c b/GraphBLAS/Source/init/GrB_init.c index 8a3498faa0..eb335e70f5 100644 --- a/GraphBLAS/Source/init/GrB_init.c +++ b/GraphBLAS/Source/init/GrB_init.c @@ -12,7 +12,7 @@ // and its RMM memory manager: use a mode of GxB_BLOCKING_GPU or // GxB_NONBLOCKING_GPU. -// Fixme for CUDA: rename GxB_*BLOCKING_GPU to GxB_*BLOCKING_CUDA. +// FIXME for CUDA: rename GxB_*BLOCKING_GPU to GxB_*BLOCKING_CUDA. 
#include "GB.h" #include "init/GB_init.h" diff --git a/GraphBLAS/Source/jit_kernels/include/GB_jit_kernel_proto.h b/GraphBLAS/Source/jit_kernels/include/GB_jit_kernel_proto.h index a9244c3bb7..5bdcb65789 100644 --- a/GraphBLAS/Source/jit_kernels/include/GB_jit_kernel_proto.h +++ b/GraphBLAS/Source/jit_kernels/include/GB_jit_kernel_proto.h @@ -662,8 +662,7 @@ GrB_Info GB_jit_kernel_subref_sparse \ const int ntasks, \ const int nthreads, \ const bool post_sort, \ - const void *Ihead_input, \ - const void *Inext_input, \ + const GrB_Matrix R, \ const void *Ap_start_input, \ const void *Ap_end_input, \ const int64_t nI, \ diff --git a/GraphBLAS/Source/jit_kernels/template/GB_jit_kernel_subref_sparse.c b/GraphBLAS/Source/jit_kernels/template/GB_jit_kernel_subref_sparse.c index 845bc4a36e..78e83c4262 100644 --- a/GraphBLAS/Source/jit_kernels/template/GB_jit_kernel_subref_sparse.c +++ b/GraphBLAS/Source/jit_kernels/template/GB_jit_kernel_subref_sparse.c @@ -37,9 +37,20 @@ GB_JIT_GLOBAL GB_JIT_KERNEL_SUBREF_SPARSE_PROTO (GB_jit_kernel) // get I const GB_I_TYPE *restrict I = I_input ; - // get I inverse lists - const GB_IHEAD_TYPE *restrict Ihead = Ihead_input ; - const GB_IHEAD_TYPE *restrict Inext = Inext_input ; + // get R for the I inverse data structure + GB_Rp_DECLARE (Rp, const) ; GB_Rp_PTR (Rp, R) ; + GB_Rh_DECLARE (Rh, const) ; GB_Rh_PTR (Rh, R) ; + GB_Ri_DECLARE_U (Ri, const) ; GB_Ri_PTR (Ri, R) ; + GrB_Matrix R_Y = (R == NULL) ? NULL : R->Y ; + const void *R_Yp = (R_Y == NULL) ? NULL : R_Y->p ; + const void *R_Yi = (R_Y == NULL) ? NULL : R_Y->i ; + const void *R_Yx = (R_Y == NULL) ? NULL : R_Y->x ; + const int64_t R_hash_bits = (R_Y == NULL) ? 0 : (R_Y->vdim - 1) ; + #define R_is_hyper GB_R_IS_HYPER + #define Rp_is_32 GB_Rp_IS_32 + #define Rj_is_32 GB_Rj_IS_32 + #define Ri_is_32 GB_Ri_IS_32 + int64_t rnvec = (R == NULL) ? 
0 : R->nvec ; #ifndef GB_Ai_IS_32 #define GB_Ai_IS_32 (GB_Ai_BITS == 32) diff --git a/GraphBLAS/Source/jit_wrappers/GB_masker_phase1_jit.c b/GraphBLAS/Source/jit_wrappers/GB_masker_phase1_jit.c index f43fe6b915..410939a585 100644 --- a/GraphBLAS/Source/jit_wrappers/GB_masker_phase1_jit.c +++ b/GraphBLAS/Source/jit_wrappers/GB_masker_phase1_jit.c @@ -29,6 +29,7 @@ GrB_Info GB_masker_phase1_jit // count nnz in each R(:,j) const int64_t *restrict R_to_Z, const bool Rp_is_32, // if true, Rp is 32-bit; else 64-bit const bool Rj_is_32, // if true, Rh is 32-bit; else 64-bit + const int R_sparsity, // GxB_SPARSE or GxB_HYPERSPARSE // original input: const GrB_Matrix M, // required mask const bool Mask_comp, // if true, then M is complemented @@ -45,7 +46,8 @@ GrB_Info GB_masker_phase1_jit // count nnz in each R(:,j) GB_jit_encoding encoding ; char *suffix ; uint64_t hash = GB_encodify_masker (&encoding, &suffix, - GB_JIT_KERNEL_MASKER_PHASE1, NULL, Rp_is_32, Rj_is_32, false, + GB_JIT_KERNEL_MASKER_PHASE1, R_sparsity, /* rtype: */ NULL, + Rp_is_32, Rj_is_32, /* Ri is not accessed: */ false, M, Mask_struct, Mask_comp, C, Z) ; //-------------------------------------------------------------------------- diff --git a/GraphBLAS/Source/jit_wrappers/GB_masker_phase2_jit.c b/GraphBLAS/Source/jit_wrappers/GB_masker_phase2_jit.c index c014467352..7897a49ef9 100644 --- a/GraphBLAS/Source/jit_wrappers/GB_masker_phase2_jit.c +++ b/GraphBLAS/Source/jit_wrappers/GB_masker_phase2_jit.c @@ -23,6 +23,7 @@ GrB_Info GB_masker_phase2_jit // phase2 for R = masker (C,M,Z) const int64_t *restrict R_to_M, const int64_t *restrict R_to_C, const int64_t *restrict R_to_Z, + const int R_sparsity, // any sparsity format // original input: const GrB_Matrix M, // required mask const bool Mask_comp, // if true, then M is complemented @@ -45,7 +46,8 @@ GrB_Info GB_masker_phase2_jit // phase2 for R = masker (C,M,Z) GB_jit_encoding encoding ; char *suffix ; uint64_t hash = GB_encodify_masker (&encoding, 
&suffix, - GB_JIT_KERNEL_MASKER_PHASE2, R, R->p_is_32, R->j_is_32, R->i_is_32, + GB_JIT_KERNEL_MASKER_PHASE2, R_sparsity, R->type, + R->p_is_32, R->j_is_32, R->i_is_32, M, Mask_struct, Mask_comp, C, Z) ; //-------------------------------------------------------------------------- diff --git a/GraphBLAS/Source/jit_wrappers/GB_subref_bitmap_jit.c b/GraphBLAS/Source/jit_wrappers/GB_subref_bitmap_jit.c index bb851ec25f..8b325ca7d9 100644 --- a/GraphBLAS/Source/jit_wrappers/GB_subref_bitmap_jit.c +++ b/GraphBLAS/Source/jit_wrappers/GB_subref_bitmap_jit.c @@ -42,7 +42,7 @@ GrB_Info GB_subref_bitmap_jit char *suffix ; uint64_t hash = GB_encodify_subref (&encoding, &suffix, GB_JIT_KERNEL_BITMAP_SUBREF, C, I_is_32, J_is_32, - Ikind, Jkind, false, false, false, A) ; + Ikind, Jkind, false, NULL, A) ; //-------------------------------------------------------------------------- // get the kernel function pointer, loading or compiling it if needed diff --git a/GraphBLAS/Source/jit_wrappers/GB_subref_sparse_jit.c b/GraphBLAS/Source/jit_wrappers/GB_subref_sparse_jit.c index 56d29ce426..69ec86fa1c 100644 --- a/GraphBLAS/Source/jit_wrappers/GB_subref_sparse_jit.c +++ b/GraphBLAS/Source/jit_wrappers/GB_subref_sparse_jit.c @@ -21,10 +21,7 @@ GrB_Info GB_subref_sparse_jit const int ntasks, // # of tasks const int nthreads, // # of threads to use const bool post_sort, // true if post-sort needed - const void *Ihead, // for I inverse buckets, size A->vlen - const void *Inext, // for I inverse buckets, size nI - const bool Ihead_is_32, // if true, Ihead/Inext 32-bit; else 64 - const bool I_has_duplicates, // true if I has duplicates + const GrB_Matrix R, // R = inverse (I), if needed // from phase0: const void *Ap_start, const void *Ap_end, @@ -47,7 +44,7 @@ GrB_Info GB_subref_sparse_jit char *suffix ; uint64_t hash = GB_encodify_subref (&encoding, &suffix, GB_JIT_KERNEL_SUBREF_SPARSE, C, I_is_32, false, Ikind, 0, - need_qsort, Ihead_is_32, I_has_duplicates, A) ; + need_qsort, R, A) ; 
//-------------------------------------------------------------------------- // get the kernel function pointer, loading or compiling it if needed @@ -66,7 +63,7 @@ GrB_Info GB_subref_sparse_jit #include "include/GB_pedantic_disable.h" GB_jit_dl_function GB_jit_kernel = (GB_jit_dl_function) dl_function ; - return (GB_jit_kernel (C, TaskList, ntasks, nthreads, post_sort, Ihead, - Inext, Ap_start, Ap_end, nI, Icolon, A, I, &GB_callback)) ; + return (GB_jit_kernel (C, TaskList, ntasks, nthreads, post_sort, R, + Ap_start, Ap_end, nI, Icolon, A, I, &GB_callback)) ; } diff --git a/GraphBLAS/Source/jitifyer/GB_encodify_masker.c b/GraphBLAS/Source/jitifyer/GB_encodify_masker.c index f1588eca6c..628f881b48 100644 --- a/GraphBLAS/Source/jitifyer/GB_encodify_masker.c +++ b/GraphBLAS/Source/jitifyer/GB_encodify_masker.c @@ -20,7 +20,8 @@ uint64_t GB_encodify_masker // encode a masker problem char **suffix, // suffix for user-defined kernel // input: const GB_jit_kcode kcode, // kernel to encode - const GrB_Matrix R, // may be NULL, for phase1 + const int R_sparsity, // any sparsity format + const GrB_Type rtype, const bool Rp_is_32, // if true, R->p is 32 bit; else 64 bit const bool Rj_is_32, // if true, R->h is 32 bit; else 64 bit const bool Ri_is_32, // if true, R->i is 32 bit; else 64 bit @@ -33,11 +34,10 @@ uint64_t GB_encodify_masker // encode a masker problem { //-------------------------------------------------------------------------- - // check if the R->type is JIT'able + // check if the rtype is JIT'able //-------------------------------------------------------------------------- - GrB_Type rtype = (R == NULL) ? 
NULL : R->type ; - if (R != NULL && rtype->hash == UINT64_MAX) + if (rtype != NULL && rtype->hash == UINT64_MAX) { // cannot JIT this type memset (encoding, 0, sizeof (GB_jit_encoding)) ; @@ -50,7 +50,8 @@ uint64_t GB_encodify_masker // encode a masker problem //-------------------------------------------------------------------------- GB_encodify_kcode (encoding, kcode) ; - GB_enumify_masker (&encoding->code, R, Rp_is_32, Rj_is_32, Ri_is_32, + GB_enumify_masker (&encoding->code, R_sparsity, rtype, + Rp_is_32, Rj_is_32, Ri_is_32, M, Mask_struct, Mask_comp, C, Z) ; //-------------------------------------------------------------------------- diff --git a/GraphBLAS/Source/jitifyer/GB_encodify_subref.c b/GraphBLAS/Source/jitifyer/GB_encodify_subref.c index d9d030a020..99e030731f 100644 --- a/GraphBLAS/Source/jitifyer/GB_encodify_subref.c +++ b/GraphBLAS/Source/jitifyer/GB_encodify_subref.c @@ -26,8 +26,7 @@ uint64_t GB_encodify_subref // encode an subref problem int Ikind, // 0: all (no I), 1: range, 2: stride, 3: list int Jkind, // ditto, or 0 if not used bool need_qsort, // true if qsort needs to be called - bool Ihead_is_32, // if true, Ihead/Inext 32-bit; else 64 - bool I_has_duplicates, // true if I has duplicate entries + GrB_Matrix R, // A matrix: GrB_Matrix A ) @@ -51,8 +50,7 @@ uint64_t GB_encodify_subref // encode an subref problem GB_encodify_kcode (encoding, kcode) ; GB_enumify_subref (&encoding->code, - C, I_is_32, J_is_32, Ikind, Jkind, need_qsort, Ihead_is_32, - I_has_duplicates, A) ; + C, I_is_32, J_is_32, Ikind, Jkind, need_qsort, R, A) ; //-------------------------------------------------------------------------- // determine the suffix and its length diff --git a/GraphBLAS/Source/jitifyer/GB_enumify_masker.c b/GraphBLAS/Source/jitifyer/GB_enumify_masker.c index 2f87cce1bd..b6bbff0dca 100644 --- a/GraphBLAS/Source/jitifyer/GB_enumify_masker.c +++ b/GraphBLAS/Source/jitifyer/GB_enumify_masker.c @@ -15,7 +15,8 @@ void GB_enumify_masker // enumify a 
masker problem // output: uint64_t *method_code, // unique encoding of the entire operation // input: - const GrB_Matrix R, // NULL for phase 1 + const int R_sparsity, // any sparsity format + const GrB_Type rtype, // the type of R (NULL for phase1) const bool Rp_is_32, // if true, R->p is 32-bit; else 64-bit const bool Rj_is_32, // if true, R->h is 32-bit; else 64-bit const bool Ri_is_32, // if true, R->i is 32-bit; else 64-bit @@ -28,12 +29,11 @@ void GB_enumify_masker // enumify a masker problem { //-------------------------------------------------------------------------- - // get the types of R, C, and Z + // check inputs //-------------------------------------------------------------------------- - GrB_Type rtype = (R == NULL) ? NULL : R->type ; - ASSERT (GB_IMPLIES (R != NULL, rtype == C->type)) ; - ASSERT (GB_IMPLIES (R != NULL, rtype == Z->type)) ; + ASSERT (GB_IMPLIES (rtype != NULL, rtype == C->type)) ; + ASSERT (GB_IMPLIES (rtype != NULL, rtype == Z->type)) ; //-------------------------------------------------------------------------- // enumify the types @@ -55,7 +55,6 @@ void GB_enumify_masker // enumify a masker problem // enumify the sparsity structures of R, C, M, and Z //-------------------------------------------------------------------------- - int R_sparsity = GB_sparsity (R) ; int C_sparsity = GB_sparsity (C) ; int M_sparsity = GB_sparsity (M) ; int Z_sparsity = GB_sparsity (Z) ; diff --git a/GraphBLAS/Source/jitifyer/GB_enumify_subref.c b/GraphBLAS/Source/jitifyer/GB_enumify_subref.c index 7cc51f38c0..0770de90eb 100644 --- a/GraphBLAS/Source/jitifyer/GB_enumify_subref.c +++ b/GraphBLAS/Source/jitifyer/GB_enumify_subref.c @@ -24,8 +24,7 @@ void GB_enumify_subref // enumerate a GrB_extract problem int Ikind, // 0: all (no I), 1: range, 2: stride, 3: list int Jkind, // ditto, or 0 if not used bool need_qsort, // true if qsort needs to be called - bool Ihead_is_32, // if true, Ihead/Inext 32-bit; else 64 - bool I_has_duplicates, // true if I has 
duplicate entries + GrB_Matrix R, // A matrix: GrB_Matrix A ) @@ -45,12 +44,13 @@ void GB_enumify_subref // enumerate a GrB_extract problem int C_sparsity = GB_sparsity (C) ; int A_sparsity = GB_sparsity (A) ; - int csparsity, asparsity ; + int R_sparsity = GB_sparsity (R) ; + int csparsity, asparsity, rsparsity ; GB_enumify_sparsity (&csparsity, C_sparsity) ; GB_enumify_sparsity (&asparsity, A_sparsity) ; + GB_enumify_sparsity (&rsparsity, R_sparsity) ; int needqsort = (need_qsort) ? 1 : 0 ; - int ihasdupl = (I_has_duplicates) ? 1 : 0 ; int i_is_32 = (I_is_32) ? 1 : 0 ; int j_is_32 = (J_is_32) ? 1 : 0 ; @@ -63,19 +63,26 @@ void GB_enumify_subref // enumerate a GrB_extract problem int aj_is_32 = (A->j_is_32) ? 1 : 0 ; int ai_is_32 = (A->i_is_32) ? 1 : 0 ; - int ihead_is_32 = (Ihead_is_32) ? 1 : 0 ; + int rp_is_32 = (R != NULL && R->p_is_32) ? 1 : 0 ; + int rj_is_32 = (R != NULL && R->j_is_32) ? 1 : 0 ; + int ri_is_32 = (R != NULL && R->i_is_32) ? 1 : 0 ; //-------------------------------------------------------------------------- // construct the subref method_code //-------------------------------------------------------------------------- - // total method_code bits: 23 (6 hex digits) + // total method_code bits: 28 (7 hex digits) (*method_code) = // range bits - // C, A integer sizes (2 hex digits) - GB_LSHIFT (ihead_is_32, 22) | // 0 to 1 1 + // R integer sizes and sparsity + GB_LSHIFT (rp_is_32 , 27) | // 0 to 1 1 + GB_LSHIFT (rj_is_32 , 26) | // 0 to 1 1 + GB_LSHIFT (ri_is_32 , 25) | // 0 to 1 1 + GB_LSHIFT (rsparsity , 23) | // 0 to 3 2 + // 22: unused + // C, A integer sizes (2 hex digits) GB_LSHIFT (cp_is_32 , 21) | // 0 to 1 1 GB_LSHIFT (cj_is_32 , 20) | // 0 to 1 1 GB_LSHIFT (ci_is_32 , 19) | // 0 to 1 1 @@ -84,10 +91,10 @@ void GB_enumify_subref // enumerate a GrB_extract problem GB_LSHIFT (aj_is_32 , 17) | // 0 to 1 1 GB_LSHIFT (ai_is_32 , 16) | // 0 to 1 1 - // need_qsort, I_has_duplicates, I and J bits (1 hex digit) + // need_qsort, I and J bits (1 
hex digit) GB_LSHIFT (i_is_32 , 15) | // 0 to 1 1 GB_LSHIFT (j_is_32 , 14) | // 0 to 1 1 - GB_LSHIFT (ihasdupl , 13) | // 0 to 1 1 + // 13: unused GB_LSHIFT (needqsort , 12) | // 0 to 1 1 // Ikind, Jkind (1 hex digit) diff --git a/GraphBLAS/Source/jitifyer/GB_jitifyer.c b/GraphBLAS/Source/jitifyer/GB_jitifyer.c index e2b5dc3cbc..385eedd226 100644 --- a/GraphBLAS/Source/jitifyer/GB_jitifyer.c +++ b/GraphBLAS/Source/jitifyer/GB_jitifyer.c @@ -1870,7 +1870,7 @@ GrB_Info GB_jitifyer_load2_worker break ; case GB_jit_subref_family : - method_code_digits = 6 ; + method_code_digits = 7 ; break ; case GB_jit_sort_family : @@ -2580,18 +2580,19 @@ void GB_jitifyer_nvcc_compile // compile: "sh -c \"" // execute with POSIX shell - // Fixme for CUDA: use GB_CUDA_COMPILER here: + // FIXME for CUDA: use GB_CUDA_COMPILER here: "nvcc --version ; " "nvcc " // compiler command "-forward-unknown-to-host-compiler " "-DGB_JIT_RUNTIME=1 " // nvcc flags - // Fixme for CUDA: add GB_CUDA_INC here: + // FIXME for CUDA: add GB_CUDA_INC here: "-I/usr/local/cuda/include -std=c++17 " " --gpu-architecture=compute_%d%d" // major,minor " --gpu-code=sm_%d%d " // major,minor " -fPIC " - // Fixme for CUDA: add GB_CUDA_FLAGS here: - " -O3 " // HACK Fixme for CUDA + // FIXME for CUDA: add GB_CUDA_FLAGS here: + " -O3 " // HACK FIXME for CUDA + " -Wno-deprecated-gpu-targets " "-I'%s/src' " // include source directory "-I'%s/src/template' " "-I'%s/src/include' " @@ -2604,6 +2605,7 @@ void GB_jitifyer_nvcc_compile "nvcc " // compiler "-DGB_JIT_RUNTIME=1 " // nvcc flags "-I/usr/local/cuda/include -std=c++17 " + " -Wno-deprecated-gpu-targets " " --gpu-architecture=compute_%d%d" // major,minor " --gpu-code=sm_%d%d " // major,minor " -shared " diff --git a/GraphBLAS/Source/jitifyer/GB_macrofy_subref.c b/GraphBLAS/Source/jitifyer/GB_macrofy_subref.c index f05a61f42b..dbad58bbdb 100644 --- a/GraphBLAS/Source/jitifyer/GB_macrofy_subref.c +++ b/GraphBLAS/Source/jitifyer/GB_macrofy_subref.c @@ -24,9 +24,13 @@ void 
GB_macrofy_subref // construct all macros for GrB_extract // extract the subref method_code //-------------------------------------------------------------------------- - // C, A integer sizes (2 hex digits) - bool Ihead_is_32 = GB_RSHIFT (method_code, 22, 1) ; + // R integer sizes and sparsity + bool Rp_is_32 = GB_RSHIFT (method_code, 27, 1) ; + bool Rj_is_32 = GB_RSHIFT (method_code, 26, 1) ; + bool Ri_is_32 = GB_RSHIFT (method_code, 25, 1) ; + int rsparsity = GB_RSHIFT (method_code, 23, 2) ; + // C, A integer sizes (2 hex digits) bool Cp_is_32 = GB_RSHIFT (method_code, 21, 1) ; bool Cj_is_32 = GB_RSHIFT (method_code, 20, 1) ; bool Ci_is_32 = GB_RSHIFT (method_code, 19, 1) ; @@ -35,10 +39,10 @@ void GB_macrofy_subref // construct all macros for GrB_extract bool Aj_is_32 = GB_RSHIFT (method_code, 17, 1) ; bool Ai_is_32 = GB_RSHIFT (method_code, 16, 1) ; - // need_qsort, I_has_duplicates, I and J bits (1 hex digit) + // need_qsort, I and J bits (1 hex digit) bool I_is_32 = GB_RSHIFT (method_code, 15, 1) ; bool J_is_32 = GB_RSHIFT (method_code, 14, 1) ; - int ihasdupl = GB_RSHIFT (method_code, 13, 1) ; + // 13: unused int needqsort = GB_RSHIFT (method_code, 12, 1) ; // Ikind, Jkind (1 hex digit) @@ -74,12 +78,11 @@ void GB_macrofy_subref // construct all macros for GrB_extract // C and A are sparse/hypersparse // Jkind not needed for sparse subsref fprintf (fp, "#define GB_NEED_QSORT %d\n", needqsort) ; - fprintf (fp, "#define GB_I_HAS_DUPLICATES %d\n", ihasdupl) ; } else { // C and A are bitmap/full - // need_qsort, I_has_duplicates not needed for bitmap subsref + // need_qsort not needed for bitmap subsref fprintf (fp, "#define GB_J_KIND ") ; switch (Jkind) { @@ -92,8 +95,6 @@ void GB_macrofy_subref // construct all macros for GrB_extract fprintf (fp, "#define GB_J_TYPE uint%d_t\n", J_is_32 ? 32 : 64) ; } - fprintf (fp, "#define GB_IHEAD_TYPE uint%d_t\n", Ihead_is_32 ? 
32 : 64) ; - //-------------------------------------------------------------------------- // construct the typedefs //-------------------------------------------------------------------------- @@ -101,7 +102,7 @@ void GB_macrofy_subref // construct all macros for GrB_extract GB_macrofy_typedefs (fp, ctype, NULL, NULL, NULL, NULL, NULL, NULL) ; //-------------------------------------------------------------------------- - // construct the macros for C and A + // construct the macros for C, A, and R //-------------------------------------------------------------------------- GB_macrofy_sparsity (fp, "C", csparsity) ; @@ -115,6 +116,11 @@ void GB_macrofy_subref // construct all macros for GrB_extract GB_macrofy_type (fp, "A", "_", atype->name) ; GB_macrofy_bits (fp, "A", Ap_is_32, Aj_is_32, Ai_is_32) ; + // R is always GrB_UINT64, and iso-valued (its values are not used) + GB_macrofy_sparsity (fp, "R", rsparsity) ; + GB_macrofy_nvals (fp, "R", rsparsity, false) ; + GB_macrofy_bits (fp, "R", Rp_is_32, Rj_is_32, Ri_is_32) ; + //-------------------------------------------------------------------------- // include the final default definitions //-------------------------------------------------------------------------- diff --git a/GraphBLAS/Source/jitifyer/GB_stringify.h b/GraphBLAS/Source/jitifyer/GB_stringify.h index 677087166f..188f559617 100644 --- a/GraphBLAS/Source/jitifyer/GB_stringify.h +++ b/GraphBLAS/Source/jitifyer/GB_stringify.h @@ -1573,6 +1573,7 @@ GrB_Info GB_masker_phase1_jit // count nnz in each R(:,j) const int64_t *restrict R_to_Z, const bool Rp_is_32, // if true, Rp is 32-bit; else 64-bit const bool Rj_is_32, // if true, Rh is 32-bit; else 64-bit + const int R_sparsity, // GxB_SPARSE or GxB_HYPERSPARSE // original input: const GrB_Matrix M, // required mask const bool Mask_comp, // if true, then M is complemented @@ -1592,6 +1593,7 @@ GrB_Info GB_masker_phase2_jit // phase2 for R = masker (C,M,Z) const int64_t *restrict R_to_M, const int64_t *restrict 
R_to_C, const int64_t *restrict R_to_Z, + const int R_sparsity, // any sparsity format // original input: const GrB_Matrix M, // required mask const bool Mask_comp, // if true, then M is complemented @@ -1614,7 +1616,8 @@ uint64_t GB_encodify_masker // encode a masker problem char **suffix, // suffix for user-defined kernel // input: const GB_jit_kcode kcode, // kernel to encode - const GrB_Matrix R, // may be NULL, for phase1 + const int R_sparsity, // GxB_SPARSE or GxB_HYPERSPARSE + const GrB_Type rtype, const bool Rp_is_32, // if true, R->p is 32 bit; else 64 bit const bool Rj_is_32, // if true, R->h is 32 bit; else 64 bit const bool Ri_is_32, // if true, R->i is 32 bit; else 64 bit @@ -1630,7 +1633,8 @@ void GB_enumify_masker // enumify a masker problem // output: uint64_t *method_code, // unique encoding of the entire operation // input: - const GrB_Matrix R, // NULL for phase 1 + const int R_sparsity, // GxB_SPARSE or GxB_HYPERSPARSE + const GrB_Type rtype, // the type of R (NULL for phase1) const bool Rp_is_32, // if true, R->p is 32-bit; else 64-bit const bool Rj_is_32, // if true, R->h is 32-bit; else 64-bit const bool Ri_is_32, // if true, R->i is 32-bit; else 64-bit @@ -1670,8 +1674,7 @@ uint64_t GB_encodify_subref // encode an subref problem int Ikind, // 0: all (no I), 1: range, 2: stride, 3: list int Jkind, // ditto, or 0 if not used bool need_qsort, // true if qsort needs to be called - bool Ihead_is_32, // if true, Ihead/Inext 32-bit; else 64 - bool I_has_duplicates, // true if I has duplicate entries + const GrB_Matrix R, // R = inverse (I), if needed // A matrix: GrB_Matrix A ) ; @@ -1688,8 +1691,7 @@ void GB_enumify_subref // enumerate a GrB_extract problem int Ikind, // 0: all (no I), 1: range, 2: stride, 3: list int Jkind, // ditto, or 0 if not used bool need_qsort, // true if qsort needs to be called - bool Ihead_is_32, // if true, Ihead/Inext 32-bit; else 64 - bool I_has_duplicates, // true if I has duplicate entries + const GrB_Matrix R, // 
R = inverse (I), if needed // A matrix: GrB_Matrix A ) ; @@ -1712,10 +1714,7 @@ GrB_Info GB_subref_sparse_jit const int ntasks, // # of tasks const int nthreads, // # of threads to use const bool post_sort, // true if post-sort needed - const void *Ihead, // for I inverse buckets, size A->vlen - const void *Inext, // for I inverse buckets, size nI - const bool Ihead_is_32, // if true, Ihead/Inext 32-bit; else 64 - const bool I_has_duplicates, // true if I has duplicates + const GrB_Matrix R, // R = inverse (I), if needed // from phase0: const void *Ap_start, const void *Ap_end, diff --git a/GraphBLAS/Source/mask/GB_mask.h b/GraphBLAS/Source/mask/GB_mask.h index a5286fc665..0c2b5b113f 100644 --- a/GraphBLAS/Source/mask/GB_mask.h +++ b/GraphBLAS/Source/mask/GB_mask.h @@ -54,6 +54,7 @@ GrB_Info GB_masker_phase1 // count nnz in each R(:,j) const int64_t *restrict R_to_Z, const bool Rp_is_32, const bool Rj_is_32, + const int R_sparsity, // GxB_SPARSE or GxB_HYPERSPARSE // original input: const GrB_Matrix M, // required mask const bool Mask_comp, // if true, then M is complemented @@ -85,7 +86,7 @@ GrB_Info GB_masker_phase2 // phase2 for R = masker (C,M,Z) const bool Rp_is_32, const bool Rj_is_32, const bool Ri_is_32, - const int R_sparsity, + const int R_sparsity, // any sparsity format // original input: const GrB_Matrix M, // required mask const bool Mask_comp, // if true, then M is complemented diff --git a/GraphBLAS/Source/mask/GB_masker.c b/GraphBLAS/Source/mask/GB_masker.c index 8fe8cc2d79..eb1eaf546f 100644 --- a/GraphBLAS/Source/mask/GB_masker.c +++ b/GraphBLAS/Source/mask/GB_masker.c @@ -208,7 +208,7 @@ GrB_Info GB_masker // R = masker (C, M, Z) // from phase1a: TaskList, R_ntasks, R_nthreads, // from phase0: - Rnvec, Rh, R_to_M, R_to_C, R_to_Z, Rp_is_32, Rj_is_32, + Rnvec, Rh, R_to_M, R_to_C, R_to_Z, Rp_is_32, Rj_is_32, R_sparsity, // original input: M, Mask_comp, Mask_struct, C, Z, Werk) ; if (info != GrB_SUCCESS) diff --git 
a/GraphBLAS/Source/mask/GB_masker_phase1.c b/GraphBLAS/Source/mask/GB_masker_phase1.c index 5e23d999b5..c7c6a0eaf9 100644 --- a/GraphBLAS/Source/mask/GB_masker_phase1.c +++ b/GraphBLAS/Source/mask/GB_masker_phase1.c @@ -43,6 +43,7 @@ GrB_Info GB_masker_phase1 // count nnz in each R(:,j) const int64_t *restrict R_to_Z, const bool Rp_is_32, const bool Rj_is_32, + const int R_sparsity, // GxB_SPARSE or GxB_HYPERSPARSE // original input: const GrB_Matrix M, // required mask const bool Mask_comp, // if true, then M is complemented @@ -60,6 +61,7 @@ GrB_Info GB_masker_phase1 // count nnz in each R(:,j) ASSERT (Rp_handle != NULL) ; ASSERT (Rp_size_handle != NULL) ; ASSERT (Rnvec_nonempty != NULL) ; + ASSERT (R_sparsity == GxB_SPARSE || R_sparsity == GxB_HYPERSPARSE) ; ASSERT_MATRIX_OK (M, "M for mask phase1", GB0) ; ASSERT (!GB_ZOMBIES (M)) ; @@ -109,7 +111,7 @@ GrB_Info GB_masker_phase1 // count nnz in each R(:,j) R_ntasks, // # of tasks R_nthreads, // # of threads to use // analysis from phase0: - Rnvec, Rh, R_to_M, R_to_C, R_to_Z, Rp_is_32, Rj_is_32, + Rnvec, Rh, R_to_M, R_to_C, R_to_Z, Rp_is_32, Rj_is_32, R_sparsity, // original input: M, Mask_comp, Mask_struct, C, Z) ; diff --git a/GraphBLAS/Source/mask/GB_masker_phase2.c b/GraphBLAS/Source/mask/GB_masker_phase2.c index f7b2e4b93e..52a01aaa6a 100644 --- a/GraphBLAS/Source/mask/GB_masker_phase2.c +++ b/GraphBLAS/Source/mask/GB_masker_phase2.c @@ -65,7 +65,7 @@ GrB_Info GB_masker_phase2 // phase2 for R = masker (C,M,Z) const bool Rp_is_32, const bool Rj_is_32, const bool Ri_is_32, - const int R_sparsity, + const int R_sparsity, // any sparsity format // original input: const GrB_Matrix M, // required mask const bool Mask_comp, // if true, then M is complemented @@ -243,7 +243,8 @@ GrB_Info GB_masker_phase2 // phase2 for R = masker (C,M,Z) //---------------------------------------------------------------------- info = GB_masker_phase2_jit (R, TaskList, R_ntasks, R_nthreads, - R_to_M, R_to_C, R_to_Z, M, Mask_comp, 
Mask_struct, C, Z, + R_to_M, R_to_C, R_to_Z, R_sparsity, + M, Mask_comp, Mask_struct, C, Z, C_ek_slicing, C_ntasks, C_nthreads, M_ek_slicing, M_ntasks, M_nthreads) ; diff --git a/GraphBLAS/Source/memory/GB_calloc_memory.c b/GraphBLAS/Source/memory/GB_calloc_memory.c index a97ddbe70b..b1978c78e6 100644 --- a/GraphBLAS/Source/memory/GB_calloc_memory.c +++ b/GraphBLAS/Source/memory/GB_calloc_memory.c @@ -38,6 +38,7 @@ static inline void *GB_calloc_helper { // clear the block of memory with a parallel memset int nthreads_max = GB_Context_nthreads_max ( ) ; + // FIXME for CUDA: need to know if this is on the GPU or CPU GB_memset (p, 0, (*size), nthreads_max) ; } diff --git a/GraphBLAS/Source/memory/GB_memcpy.c b/GraphBLAS/Source/memory/GB_memcpy.c index ff075f9579..38d69f2308 100644 --- a/GraphBLAS/Source/memory/GB_memcpy.c +++ b/GraphBLAS/Source/memory/GB_memcpy.c @@ -22,7 +22,7 @@ void GB_memcpy // parallel memcpy ) { - // Fixme for CUDA: do: + // FIXME for CUDA: do (cpu,gpu) <= (cpu,gpu) memcpy's // cpu <- cpu (already done below) // cpu <- gpu (effectively done below but could be better) // gpu <- cpu (need this) diff --git a/GraphBLAS/Source/memory/GB_memset.c b/GraphBLAS/Source/memory/GB_memset.c index 0fcd20e0b1..95af65c40b 100644 --- a/GraphBLAS/Source/memory/GB_memset.c +++ b/GraphBLAS/Source/memory/GB_memset.c @@ -9,6 +9,8 @@ // Note that this function uses its own hard-coded chunk size. 
+// FIXME for CUDA: use CUDA memset if the array is on the GPU + #include "GB.h" #define GB_MEM_CHUNK (1024*1024) diff --git a/GraphBLAS/Source/mxm/GB_AxB_dot.c b/GraphBLAS/Source/mxm/GB_AxB_dot.c index 8c9be144d5..23caedc760 100644 --- a/GraphBLAS/Source/mxm/GB_AxB_dot.c +++ b/GraphBLAS/Source/mxm/GB_AxB_dot.c @@ -200,7 +200,7 @@ GrB_Info GB_AxB_dot // dot product (multiple methods) info = GrB_NO_VALUE ; #if defined ( GRAPHBLAS_HAS_CUDA ) - if (!C_iso && // Fixme for CUDA, remove and create C iso on output + if (!C_iso && // FIXME for CUDA, remove and create C iso on output GB_cuda_AxB_dot3_branch (M, Mask_struct, A, B, semiring, flipxy)) { info = (GB_cuda_AxB_dot3 (C, M, Mask_struct, A, B, semiring, diff --git a/GraphBLAS/Source/pack_unpack/GxB_pack_HyperHash.c b/GraphBLAS/Source/pack_unpack/GxB_pack_HyperHash.c index 4a61ad5957..daa8847572 100644 --- a/GraphBLAS/Source/pack_unpack/GxB_pack_HyperHash.c +++ b/GraphBLAS/Source/pack_unpack/GxB_pack_HyperHash.c @@ -116,7 +116,7 @@ GrB_Info GxB_pack_HyperHash // move Y into A->Y (*Y) = NULL ; A->Y_shallow = false ; A->no_hyper_hash = false ; // A now has a hyper_hash matrix A->Y - ASSERT_MATRIX_OK (A, "A with new hyperhash", GB0) ; + ASSERT_MATRIX_OK (A, "A with new hyper_hash", GB0) ; return (GrB_SUCCESS) ; } diff --git a/GraphBLAS/Source/print/GB_matvec_check.c b/GraphBLAS/Source/print/GB_matvec_check.c index afe0f6ae2f..0b927c1753 100644 --- a/GraphBLAS/Source/print/GB_matvec_check.c +++ b/GraphBLAS/Source/print/GB_matvec_check.c @@ -112,13 +112,12 @@ GrB_Info GB_matvec_check // check a GraphBLAS matrix or vector GB_CHECK_MAGIC (A) ; - GB_Ap_DECLARE (Ap, const) ; GB_Ap_PTR (Ap, A) ; - GB_Ah_DECLARE (Ah, const) ; GB_Ah_PTR (Ah, A) ; - GB_Ai_DECLARE (Ai, const) ; GB_Ai_PTR (Ai, A) ; - GB_AYp_DECLARE (A_Yp, const) ; GB_AYp_PTR (A_Yp, A) ; - GB_AYi_DECLARE (A_Yi, const) ; GB_AYi_PTR (A_Yi, A) ; - GB_AYx_DECLARE (A_Yx, const) ; GB_AYx_PTR (A_Yx, A) ; - + GB_Ap_DECLARE (Ap, const) ; GB_Ap_PTR (Ap, A) ; + GB_Ah_DECLARE 
(Ah, const) ; GB_Ah_PTR (Ah, A) ; + GB_Ai_DECLARE (Ai, const) ; GB_Ai_PTR (Ai, A) ; + GB_MDECL (A_Yp, const, u) ; GB_GET_HYPER_PTR (A_Yp, A, p) ; + GB_MDECL (A_Yi, const, u) ; GB_GET_HYPER_PTR (A_Yi, A, i) ; + GB_MDECL (A_Yx, const, u) ; GB_GET_HYPER_PTR (A_Yx, A, x) ; const int8_t *restrict Ab = A->b ; //-------------------------------------------------------------------------- diff --git a/GraphBLAS/Source/select/GB_select_bitmap.c b/GraphBLAS/Source/select/GB_select_bitmap.c index 0f38cff73f..dfd9f5dfdc 100644 --- a/GraphBLAS/Source/select/GB_select_bitmap.c +++ b/GraphBLAS/Source/select/GB_select_bitmap.c @@ -86,7 +86,7 @@ GrB_Info GB_select_bitmap else { // Cx [0:anz-1] = Ax [0:anz-1] - // Fixme for CUDA: do this on the GPU if appropriate + // FIXME for CUDA: do this on the GPU if appropriate GB_memcpy (C->x, A->x, anz * asize, nthreads) ; } diff --git a/GraphBLAS/Source/select/GB_selector.c b/GraphBLAS/Source/select/GB_selector.c index 4d86498cc9..f150985f26 100644 --- a/GraphBLAS/Source/select/GB_selector.c +++ b/GraphBLAS/Source/select/GB_selector.c @@ -193,7 +193,7 @@ GrB_Info GB_selector // the use_select_bitmap test above (the DIAG operator). The CUDA // select_sparse kernel will not work in this case, so make this go to // the CPU. - // Fixme CUDA: put the test of sparse(A) or hypersparse(A) in + // FIXME CUDA: put the test of sparse(A) or hypersparse(A) in // GB_cuda_select_branch. 
info = GB_cuda_select_sparse (C, C_iso, op, flipij, A, athunk, ythunk, Werk) ; diff --git a/GraphBLAS/Source/sort/GB_bitonic.c b/GraphBLAS/Source/sort/GB_bitonic.c new file mode 100644 index 0000000000..99c4599d63 --- /dev/null +++ b/GraphBLAS/Source/sort/GB_bitonic.c @@ -0,0 +1,80 @@ + +// References: +// https://www.tools-of-computing.com/tc/CS/Sorts/bitonic_sort.htm +// https://sortingalgos.miraheze.org/wiki/Bitonic_Sort + +#include "sort/GB_sort.h" + +//------------------------------------------------------------------------------ +// GB_bitonic: bitonic sort +//------------------------------------------------------------------------------ + +GrB_Info GB_bitonic +( + int32_t *restrict A, // array of size n + int64_t n, // n does not need to be a power of 2 + int nthreads +) +{ + printf ("GB_bitonic: n %ld, nthreads %d\n", n, nthreads) ; + +// for CUDA variant: +// int tid = blockIdx.x * blockDim.x + threadIdx.x ; +// int nthreads = blockDim.x * gridDim.x ; +// int64_t oops = 0 ; + + int64_t Nhalf = n/2 ; + for (int64_t k = 2, stage = 1 ; k < 2*n ; k = k << 1, stage++) + { + bool dir = (((((n-1) >> stage) + 1) & 1) != 0) ; + for (int64_t j = k >> 1 ; j > 0 ; j = j >> 1) + { + uint64_t mask = j-1 ; + + // for CUDA variant: + // parallel loop for all threads in the threadblock: +// for (int64_t ipair = tid ; ipair < Nhalf ; ipair += nthreads) + + int64_t ipair ; + #pragma omp parallel for num_threads(nthreads) schedule(static) + for (ipair = 0 ; ipair < Nhalf ; ipair++) + { + // Consider the pair of entries A [ileft] and A [iright] where + // ileft < iright always holds. The ileft entry is obtained by + // inserting a 0-bit in ipair, where the lower bits of ipair + // (in the mask) are kept and the upper bits are shifted to the + // left by one. For example, when j = 8, mask = 7 = 0111 in + // binary, then ileft is obtained by shifting the upper bits + // (all but the lower 3) of ipair to the left by one bit + // position, inserting a zero bit. 
Thus, if ipair = 1101111 + // and j=8 then ileft = 1101o111 where o = 0 denotes the + // inserted bit in ileft. Then iright is obtained by setting + // the o bit to 1. Thus ileft < iright always holds. + int64_t ileft = ((ipair & ~mask) << 1) | (ipair & mask) ; + int64_t iright = ileft | j ; + // ensure that A [iright] is in the range A [0..n-1] + if (iright >= n) + { + /* oops++ ; */ + continue ; + } + // if desc is true, swap descending, else swap ascending + bool desc = (((ileft & k) != 0) == dir) ; + int aleft = A [ileft] ; + int aright = A [iright] ; + if (desc ? (aleft < aright) : (aleft > aright)) + { + // swap A [ileft] and A [iright] + A [ileft ] = aright ; + A [iright] = aleft ; + } + } + // for CUDA variant: + // syncthreads here + } + } + + // printf ("oops: %ld\n", oops) ; + return (GrB_SUCCESS) ; +} + diff --git a/GraphBLAS/Source/sort/GB_msort_1.c b/GraphBLAS/Source/sort/GB_msort_1.c index 292a0b28fb..26fc42b1ee 100644 --- a/GraphBLAS/Source/sort/GB_msort_1.c +++ b/GraphBLAS/Source/sort/GB_msort_1.c @@ -63,6 +63,15 @@ GrB_Info GB_msort_1 // sort array A of size 1-by-n //-------------------------------------------------------------------------- int nthreads = GB_nthreads (n, GB_MSORT_BASECASE, nthreads_max) ; + +#if 0 + // HACK: to test GB_bitonic: + if (A0_is_32) + { + return (GB_bitonic (A_0, n, nthreads)) ; + } +#endif + if (nthreads <= 1 || n <= GB_MSORT_BASECASE) { // sequential quicksort diff --git a/GraphBLAS/Source/sort/GB_sort.h b/GraphBLAS/Source/sort/GB_sort.h index ac480134de..f3cbcfa225 100644 --- a/GraphBLAS/Source/sort/GB_sort.h +++ b/GraphBLAS/Source/sort/GB_sort.h @@ -266,6 +266,17 @@ GrB_Info GB_msort_3 // sort array A of size 3-by-n int nthreads_max // max # of threads to use ) ; +//------------------------------------------------------------------------------ +// bitonic sort +//------------------------------------------------------------------------------ + +GrB_Info GB_bitonic +( + int32_t *restrict A, // array of size n + 
int64_t n, // n does not need to be a power of 2 + int nthreads +) ; + //------------------------------------------------------------------------------ // matrix sorting (for GxB_Matrix_sort and GxB_Vector_sort) //------------------------------------------------------------------------------ diff --git a/GraphBLAS/Source/wait/GB_wait.c b/GraphBLAS/Source/wait/GB_wait.c index 6cbfab9ac9..2f443022cb 100644 --- a/GraphBLAS/Source/wait/GB_wait.c +++ b/GraphBLAS/Source/wait/GB_wait.c @@ -339,7 +339,7 @@ GrB_Info GB_wait // finish all pending computations ASSERT_MATRIX_OK (S, "S after GB_wait:add", GB0) ; //-------------------------------------------------------------------------- - // check if the A->Y hyper-hash can be kept + // check if the A->Y hyper_hash can be kept //-------------------------------------------------------------------------- if (A->no_hyper_hash) diff --git a/GraphBLAS/Tcov/log_Dec3.txt b/GraphBLAS/Tcov/log_Dec3.txt new file mode 100644 index 0000000000..0a30fa9984 --- /dev/null +++ b/GraphBLAS/Tcov/log_Dec3.txt @@ -0,0 +1,335 @@ +03-Dec-2025 12:58:16 grbcov starting +total blocks: 19589 + +---------------------------------------------- [cover] +[malloc debugging turned on] +03-Dec 12:59:54 test303 0:(32,32,32) 0.3 sec 97: 19277 1.6% 297.1/s +03-Dec 12:59:54 test303 0:(32,32,32) 0.1 sec 3: 19273 1.6% 35.6/s +03-Dec 12:59:55 test303 0:(32,32,32) 0.1 sec 6: 19267 1.6% 72.7/s +03-Dec 12:59:55 test300 0:(32,32,32) 0.0 sec 134: 19131 2.3% 4780.4/s +03-Dec 13:00:00 test301 0:(32,32,32) 5.8 sec 309: 18822 3.9% 53.7/s +03-Dec 13:00:05 test301 0:(32,32,32) 4.6 sec 37: 18785 4.1% 8.0/s +03-Dec 13:00:05 test302 0:(32,32,32) 0.0 sec 47: 18738 4.3% 1796.6/s +03-Dec 13:00:05 test155 0:(32,32,32) 0.1 sec 207: 18531 5.4% 3698.2/s +03-Dec 13:00:05 test155 0:(32,32,32) 0.0 sec 33: 18498 5.6% 1424.4/s +03-Dec 13:00:05 test155 2:(32,64,32) 0.0 sec 6: 18492 5.6% 322.0/s +03-Dec 13:00:05 test155 2:(32,64,32) 0.0 sec : 18492 5.6% +03-Dec 13:00:05 test155 4:(64,32,32) 0.0 
sec 2: 18490 5.6% 106.1/s +03-Dec 13:00:05 test155 4:(64,32,32) 0.0 sec : 18490 5.6% +[malloc debugging turned off] +03-Dec 13:00:05 test299 0:(32,32,32) 0.0 sec 18: 18472 5.7% 687.7/s +03-Dec 13:00:07 test298 0:(32,32,32) 1.7 sec 119: 18353 6.3% 68.3/s +03-Dec 13:00:07 test298 0:(32,32,32) 0.0 sec 37: 18316 6.5% 1948.1/s +03-Dec 13:00:07 test297 0:(32,32,32) 0.3 sec 129: 18187 7.2% 431.0/s +03-Dec 13:00:07 test295 0:(32,32,32) 0.0 sec 12: 18175 7.2% 440.6/s +03-Dec 13:00:07 test294 0:(32,32,32) 0.1 sec 15: 18160 7.3% 187.7/s +03-Dec 13:00:11 test293 0:(32,32,32) 3.6 sec 44: 18116 7.5% 12.3/s +03-Dec 13:00:11 test291 0:(32,32,32) 0.0 sec 12: 18104 7.6% 476.4/s +03-Dec 13:00:11 test291 1:(32,32,64) 0.0 sec 1: 18103 7.6% 3311.3/s +03-Dec 13:00:11 test291 2:(32,64,32) 0.0 sec 1: 18102 7.6% 4444.4/s +03-Dec 13:00:11 test291 4:(64,32,32) 0.0 sec 1: 18101 7.6% 4784.7/s +03-Dec 13:00:13 test290 0:(32,32,32) 1.6 sec 2: 18099 7.6% 1.2/s +03-Dec 13:00:13 test287 0:(32,32,32) 0.0 sec 32: 18067 7.8% 1053.4/s +03-Dec 13:00:13 test287 4:(64,32,32) 0.0 sec : 18067 7.8% +03-Dec 13:00:13 test286 0:(32,32,32) 0.8 sec 34: 18033 7.9% 45.2/s +03-Dec 13:00:13 test286 0:(32,32,32) 0.0 sec 9: 18024 8.0% 3250.3/s +03-Dec 13:00:14 test286 1:(32,32,64) 0.8 sec : 18024 8.0% +03-Dec 13:00:14 test286 1:(32,32,64) 0.0 sec 1: 18023 8.0% 998.0/s +03-Dec 13:00:15 test286 2:(32,64,32) 0.8 sec : 18023 8.0% +03-Dec 13:00:15 test286 2:(32,64,32) 0.0 sec : 18023 8.0% +03-Dec 13:00:16 test286 4:(64,32,32) 0.9 sec : 18023 8.0% +03-Dec 13:00:16 test286 4:(64,32,32) 0.0 sec : 18023 8.0% +03-Dec 13:00:17 test78 0:(32,32,32) 0.5 sec 18: 18005 8.1% 33.3/s +03-Dec 13:00:17 test78 0:(32,32,32) 0.0 sec : 18005 8.1% +03-Dec 13:00:17 test78 4:(64,32,32) 0.2 sec 1: 18004 8.1% 4.1/s +03-Dec 13:00:17 test78 4:(64,32,32) 0.0 sec : 18004 8.1% +03-Dec 13:00:18 test285 0:(32,32,32) 0.7 sec 33: 17971 8.3% 46.4/s +03-Dec 13:00:18 test285 0:(32,32,32) 0.0 sec 8: 17963 8.3% 1299.5/s +03-Dec 13:00:18 test247 0:(32,32,32) 0.2 
sec 22: 17941 8.4% 109.1/s +03-Dec 13:00:18 test247 0:(32,32,32) 0.1 sec 5: 17936 8.4% 47.5/s +03-Dec 13:00:18 test109 0:(32,32,32) 0.2 sec 31: 17905 8.6% 184.0/s +03-Dec 13:00:18 test109 0:(32,32,32) 0.0 sec 2: 17903 8.6% 1693.5/s +03-Dec 13:00:18 test109 0:(32,32,32) 0.1 sec 21: 17882 8.7% 183.9/s +03-Dec 13:00:18 test109 0:(32,32,32) 0.0 sec 1: 17881 8.7% 1137.7/s +03-Dec 13:00:18 test138 0:(32,32,32) 0.1 sec 27: 17854 8.9% 199.3/s +03-Dec 13:00:18 test138 0:(32,32,32) 0.0 sec 1: 17853 8.9% 50.7/s +03-Dec 13:00:18 test172 0:(32,32,32) 0.1 sec 41: 17811 9.1% 642.7/s +03-Dec 13:00:18 test172 0:(32,32,32) 0.0 sec 4: 17807 9.1% 135.4/s +03-Dec 13:00:19 test174 0:(32,32,32) 0.2 sec 19: 17788 9.2% 92.6/s +03-Dec 13:00:19 test174 0:(32,32,32) 0.0 sec 6: 17782 9.2% 1139.2/s +03-Dec 13:00:19 test203 0:(32,32,32) 0.0 sec 14: 17768 9.3% 497.3/s +03-Dec 13:00:19 test213 0:(32,32,32) 0.1 sec 10: 17758 9.3% 96.1/s +03-Dec 13:00:19 test213 0:(32,32,32) 0.0 sec 1: 17757 9.4% 760.5/s +03-Dec 13:00:19 test216 0:(32,32,32) 0.0 sec 11: 17746 9.4% 347.7/s +03-Dec 13:00:19 test225 0:(32,32,32) 0.3 sec 23: 17723 9.5% 85.3/s +03-Dec 13:00:19 test225 0:(32,32,32) 0.1 sec 29: 17694 9.7% 574.3/s +03-Dec 13:00:19 test226 0:(32,32,32) 0.2 sec 11: 17683 9.7% 60.9/s +03-Dec 13:00:19 test226 0:(32,32,32) 0.0 sec 2: 17681 9.7% 451.0/s +03-Dec 13:00:19 test235 0:(32,32,32) 0.1 sec 19: 17662 9.8% 281.7/s +03-Dec 13:00:19 test235 0:(32,32,32) 0.0 sec 3: 17659 9.9% 302.0/s +03-Dec 13:00:19 test252 0:(32,32,32) 0.0 sec 15: 17644 9.9% 575.7/s +03-Dec 13:00:20 test253 0:(32,32,32) 0.1 sec 30: 17614 10.1% 513.4/s +03-Dec 13:00:20 test255 0:(32,32,32) 0.0 sec 8: 17606 10.1% 310.4/s +03-Dec 13:00:20 test257 0:(32,32,32) 0.5 sec 51: 17555 10.4% 101.2/s +03-Dec 13:00:20 test257 0:(32,32,32) 0.0 sec 2: 17553 10.4% 2314.8/s +03-Dec 13:00:20 test260 0:(32,32,32) 0.0 sec 3: 17550 10.4% 110.1/s +03-Dec 13:00:20 test261 0:(32,32,32) 0.0 sec 37: 17513 10.6% 1534.1/s +03-Dec 13:00:20 test262 0:(32,32,32) 0.0 sec 
15: 17498 10.7% 618.7/s +03-Dec 13:00:20 test263 0:(32,32,32) 0.3 sec 12: 17486 10.7% 44.3/s +03-Dec 13:00:20 test263 0:(32,32,32) 0.0 sec 1: 17485 10.7% 660.9/s +03-Dec 13:00:20 test264 0:(32,32,32) 0.0 sec 146: 17339 11.5% 5611.1/s +03-Dec 13:00:21 test265 0:(32,32,32) 0.5 sec 11: 17328 11.5% 20.2/s +03-Dec 13:00:21 test265 0:(32,32,32) 0.0 sec 1: 17327 11.5% 944.3/s +03-Dec 13:00:21 test267 0:(32,32,32) 0.3 sec 16: 17311 11.6% 63.2/s +03-Dec 13:00:21 test269 0:(32,32,32) 0.0 sec 121: 17190 12.2% 3611.9/s +03-Dec 13:00:21 test271 0:(32,32,32) 0.0 sec 543: 16647 15.0% 11925.9/s +03-Dec 13:00:21 test272 0:(32,32,32) 0.0 sec 8: 16639 15.1% 302.2/s +03-Dec 13:00:21 test273 0:(32,32,32) 0.1 sec 122: 16517 15.7% 2091.9/s +03-Dec 13:00:22 test274 0:(32,32,32) 0.0 sec 119: 16398 16.3% 3592.3/s +03-Dec 13:00:22 test276 0:(32,32,32) 0.2 sec 1697: 14701 25.0% 6989.2/s +03-Dec 13:00:22 test277 0:(32,32,32) 0.0 sec 26: 14675 25.1% 1013.1/s +03-Dec 13:00:22 test279 0:(32,32,32) 0.0 sec 69: 14606 25.4% 2556.4/s +03-Dec 13:00:22 test281 0:(32,32,32) 0.0 sec 14: 14592 25.5% 529.8/s +03-Dec 13:00:22 test268 0:(32,32,32) 0.2 sec 4: 14588 25.5% 20.1/s +03-Dec 13:00:22 test268 0:(32,32,32) 0.0 sec 4: 14584 25.6% 650.1/s +03-Dec 13:00:22 test207 0:(32,32,32) 0.0 sec 11: 14573 25.6% 2383.5/s +03-Dec 13:00:22 test207 1:(32,32,64) 0.0 sec 1: 14572 25.6% 1136.4/s +03-Dec 13:00:23 test211 0:(32,32,32) 0.4 sec 14: 14558 25.7% 31.2/s +03-Dec 13:00:23 test211 0:(32,32,32) 0.0 sec 6: 14552 25.7% 1219.8/s +03-Dec 13:00:23 test183 0:(32,32,32) 0.1 sec 16: 14536 25.8% 108.1/s +03-Dec 13:00:23 test212 0:(32,32,32) 0.3 sec 9: 14527 25.8% 35.5/s +03-Dec 13:00:23 test212 0:(32,32,32) 0.0 sec 5: 14522 25.9% 2038.3/s +03-Dec 13:00:23 test219 0:(32,32,32) 0.0 sec 8: 14514 25.9% 279.1/s +03-Dec 13:00:23 test219 0:(32,32,32) 0.0 sec 4: 14510 25.9% 6369.4/s +[malloc debugging turned on] +03-Dec 13:00:23 test296 0:(32,32,32) 0.0 sec 23: 14487 26.0% 5738.5/s +03-Dec 13:00:23 test289 0:(32,32,32) 0.2 sec 74: 
14413 26.4% 340.6/s +03-Dec 13:00:23 test288 0:(32,32,32) 0.0 sec 17: 14396 26.5% 644.4/s +03-Dec 13:00:23 test244 0:(32,32,32) 0.1 sec 23: 14373 26.6% 299.2/s +03-Dec 13:00:23 test244 1:(32,32,64) 0.0 sec 2: 14371 26.6% 67.7/s +03-Dec 13:00:23 test194 0:(32,32,32) 0.1 sec 24: 14347 26.8% 210.0/s +03-Dec 13:00:24 test09 0:(32,32,32) 0.3 sec 8: 14339 26.8% 29.0/s +03-Dec 13:00:24 test09 0:(32,32,32) 0.0 sec 4: 14335 26.8% 2059.7/s +03-Dec 13:00:24 test108 0:(32,32,32) 0.2 sec 51: 14284 27.1% 268.1/s +03-Dec 13:00:24 test108 0:(32,32,32) 0.0 sec 25: 14259 27.2% 654.3/s +03-Dec 13:00:24 test137 0:(32,32,32) 0.1 sec 13: 14246 27.3% 100.7/s +03-Dec 13:00:24 test137 0:(32,32,32) 0.0 sec 1: 14245 27.3% 338.4/s +03-Dec 13:00:24 test137 0:(32,32,32) 0.0 sec 1: 14244 27.3% 244.7/s +03-Dec 13:00:24 test124 0:(32,32,32) 0.2 sec 6: 14238 27.3% 31.3/s +03-Dec 13:00:29 test133 0:(32,32,32) 4.2 sec 10: 14228 27.4% 2.4/s +03-Dec 13:00:34 test133 0:(32,32,32) 5.0 sec 21: 14207 27.5% 4.2/s +03-Dec 13:00:35 test176 0:(32,32,32) 1.6 sec 25: 14182 27.6% 15.7/s +03-Dec 13:00:35 test176 0:(32,32,32) 0.1 sec 10: 14172 27.7% 132.9/s +03-Dec 13:00:35 test197 0:(32,32,32) 0.2 sec 12: 14160 27.7% 58.5/s +03-Dec 13:00:36 test197 0:(32,32,32) 0.2 sec 3: 14157 27.7% 15.4/s +03-Dec 13:00:36 test201 0:(32,32,32) 0.1 sec 23: 14134 27.8% 363.6/s +03-Dec 13:00:36 test208 0:(32,32,32) 0.1 sec 17: 14117 27.9% 252.7/s +03-Dec 13:00:36 test214 0:(32,32,32) 0.0 sec 39: 14078 28.1% 7656.1/s +03-Dec 13:00:36 test214 0:(32,32,32) 0.0 sec 5: 14073 28.2% 1702.4/s +03-Dec 13:00:36 test214 1:(32,32,64) 0.0 sec 1: 14072 28.2% 317.6/s +03-Dec 13:00:36 test214 1:(32,32,64) 0.0 sec : 14072 28.2% +03-Dec 13:00:36 test223 0:(32,32,32) 0.0 sec 11: 14061 28.2% 1262.0/s +03-Dec 13:00:36 test223 0:(32,32,32) 0.0 sec 1: 14060 28.2% 68.2/s +03-Dec 13:00:36 test241 0:(32,32,32) 0.1 sec 54: 14006 28.5% 784.0/s +03-Dec 13:00:36 test241 0:(32,32,32) 0.0 sec 8: 13998 28.5% 224.5/s +03-Dec 13:00:36 test270 0:(32,32,32) 0.0 sec 
288: 13710 30.0% 8027.0/s +03-Dec 13:00:36 test199 0:(32,32,32) 0.0 sec 16: 13694 30.1% 4250.8/s +03-Dec 13:00:36 test210 0:(32,32,32) 0.1 sec : 13694 30.1% +03-Dec 13:00:36 test210 0:(32,32,32) 0.0 sec 3: 13691 30.1% 2765.0/s +03-Dec 13:00:36 test165 0:(32,32,32) 0.0 sec 13: 13678 30.2% 4701.6/s +03-Dec 13:00:36 test221 0:(32,32,32) 0.1 sec 6: 13672 30.2% 69.8/s +03-Dec 13:00:36 test221 0:(32,32,32) 0.0 sec 1: 13671 30.2% 624.2/s +03-Dec 13:00:36 test278 0:(32,32,32) 0.0 sec 53: 13618 30.5% 1847.8/s +03-Dec 13:00:36 test162 0:(32,32,32) 0.0 sec 20: 13598 30.6% 941.9/s +03-Dec 13:00:36 test162 0:(32,32,32) 0.0 sec 1: 13597 30.6% 40.4/s +03-Dec 13:00:36 test275 0:(32,32,32) 0.0 sec 27: 13570 30.7% 798.4/s +03-Dec 13:00:37 test220 0:(32,32,32) 0.2 sec 9: 13561 30.8% 54.7/s +03-Dec 13:00:37 test83 0:(32,32,32) 0.3 sec 7: 13554 30.8% 26.2/s +03-Dec 13:00:37 test83 0:(32,32,32) 0.0 sec 1: 13553 30.8% 1479.3/s +03-Dec 13:00:37 test04 0:(32,32,32) 0.6 sec 12: 13541 30.9% 20.1/s +03-Dec 13:00:37 test04 0:(32,32,32) 0.0 sec 8: 13533 30.9% 1220.8/s +03-Dec 13:00:37 test132 0:(32,32,32) 0.0 sec 9: 13524 31.0% 245.6/s +03-Dec 13:00:38 test82 0:(32,32,32) 0.4 sec 9: 13515 31.0% 23.1/s +03-Dec 13:00:38 test202 0:(32,32,32) 0.0 sec 71: 13444 31.4% 3970.9/s +03-Dec 13:00:38 test202 0:(32,32,32) 0.0 sec 2: 13442 31.4% 177.1/s +03-Dec 13:00:38 test202 0:(32,32,32) 0.0 sec 4: 13438 31.4% 372.4/s +03-Dec 13:00:38 test202 1:(32,32,64) 0.0 sec : 13438 31.4% +03-Dec 13:00:38 test202 1:(32,32,64) 0.0 sec 1: 13437 31.4% 86.5/s +03-Dec 13:00:38 test202 1:(32,32,64) 0.0 sec : 13437 31.4% +03-Dec 13:00:38 test202 2:(32,64,32) 0.0 sec 1: 13436 31.4% 105.8/s +03-Dec 13:00:38 test202 2:(32,64,32) 0.0 sec : 13436 31.4% +03-Dec 13:00:38 test202 2:(32,64,32) 0.0 sec : 13436 31.4% +03-Dec 13:00:38 test222 0:(32,32,32) 0.1 sec 40: 13396 31.6% 716.0/s +03-Dec 13:00:38 test204 0:(32,32,32) 0.0 sec 9: 13387 31.7% 356.8/s +03-Dec 13:00:38 test258 0:(32,32,32) 0.2 sec 13: 13374 31.7% 61.4/s +03-Dec 
13:00:38 test258 0:(32,32,32) 0.0 sec 2: 13372 31.7% 1156.7/s +03-Dec 13:00:38 test258 1:(32,32,64) 0.1 sec 1: 13371 31.7% 18.4/s +03-Dec 13:00:38 test258 1:(32,32,64) 0.0 sec : 13371 31.7% +03-Dec 13:00:39 test136 0:(32,32,32) 1.0 sec 4: 13367 31.8% 3.8/s +03-Dec 13:00:39 test136 0:(32,32,32) 0.0 sec 53: 13314 32.0% 3349.3/s +03-Dec 13:00:40 test128 0:(32,32,32) 0.8 sec 52: 13262 32.3% 67.1/s +03-Dec 13:00:40 test128 0:(32,32,32) 0.1 sec 1: 13261 32.3% 8.9/s +03-Dec 13:00:41 test144 0:(32,32,32) 0.2 sec 5: 13256 32.3% 22.6/s +03-Dec 13:00:43 test81 0:(32,32,32) 2.3 sec 39: 13217 32.5% 16.6/s +[malloc debugging turned off] +03-Dec 13:00:43 testc2(0,0) 0:(32,32,32) 0.4 sec 172: 13045 33.4% 390.2/s +03-Dec 13:00:44 testc2(0,0) 1:(32,32,64) 0.3 sec 1: 13044 33.4% 3.6/s +03-Dec 13:00:44 test239 0:(32,32,32) 0.0 sec 10: 13034 33.5% 1143.1/s +03-Dec 13:00:44 test239 0:(32,32,32) 0.3 sec 1: 13033 33.5% 3.1/s +03-Dec 13:00:44 test245 0:(32,32,32) 0.3 sec 32: 13001 33.6% 93.5/s +03-Dec 13:00:44 test245 0:(32,32,32) 0.0 sec 8: 12993 33.7% 1889.9/s +03-Dec 13:00:45 test159 0:(32,32,32) 0.3 sec 39: 12954 33.9% 127.0/s +03-Dec 13:00:46 test259 0:(32,32,32) 1.0 sec 27: 12927 34.0% 28.3/s +03-Dec 13:00:46 test259 0:(32,32,32) 0.0 sec 1: 12926 34.0% 147.7/s +03-Dec 13:00:47 testc4(0) 0:(32,32,32) 1.1 sec 11: 12915 34.1% 10.3/s +03-Dec 13:00:47 test157 0:(32,32,32) 0.6 sec 42: 12873 34.3% 66.2/s +03-Dec 13:00:49 test182 0:(32,32,32) 1.9 sec 22: 12851 34.4% 11.9/s +03-Dec 13:00:49 test182 0:(32,32,32) 0.3 sec 13: 12838 34.5% 49.9/s +03-Dec 13:00:51 test195 0:(32,32,32) 1.9 sec 62: 12776 34.8% 32.7/s +03-Dec 13:00:55 test135 0:(32,32,32) 3.8 sec 43: 12733 35.0% 11.3/s +03-Dec 13:00:55 test215 0:(32,32,32) 0.1 sec 2: 12731 35.0% 23.9/s +03-Dec 13:00:57 test80 0:(32,32,32) 2.2 sec 11: 12720 35.1% 4.9/s +03-Dec 13:00:58 test200 0:(32,32,32) 0.7 sec 11: 12709 35.1% 15.8/s +03-Dec 13:01:00 test283 0:(32,32,32) 2.2 sec 94: 12615 35.6% 43.0/s +03-Dec 13:01:02 test283 1:(32,32,64) 1.5 sec 1: 
12614 35.6% 0.7/s +03-Dec 13:01:02 test254 0:(32,32,32) 0.3 sec 25: 12589 35.7% 93.2/s +03-Dec 13:01:03 test254 0:(32,32,32) 1.1 sec 1: 12588 35.7% 0.9/s +03-Dec 13:01:06 test54 0:(32,32,32) 3.1 sec 26: 12561 35.9% 8.4/s +03-Dec 13:01:08 test54 0:(32,32,32) 1.3 sec 13: 12548 35.9% 10.2/s +03-Dec 13:01:10 testcc(1) 0:(32,32,32) 2.2 sec 10: 12538 36.0% 4.4/s +03-Dec 13:01:11 testcc(1) 0:(32,32,32) 1.4 sec 6: 12532 36.0% 4.3/s +03-Dec 13:01:12 testc2(1,1) 0:(32,32,32) 0.3 sec 11: 12521 36.1% 37.6/s +03-Dec 13:01:14 testc2(1,1) 0:(32,32,32) 2.7 sec 3: 12518 36.1% 1.1/s +03-Dec 13:01:16 test141 0:(32,32,32) 1.4 sec 521: 11997 38.8% 378.2/s +03-Dec 13:01:16 test179 0:(32,32,32) 0.0 sec 22: 11975 38.9% 481.3/s +03-Dec 13:01:16 test179 0:(32,32,32) 0.7 sec 10: 11965 38.9% 14.4/s +03-Dec 13:01:16 test188b 0:(32,32,32) 0.0 sec 39: 11926 39.1% 1009.2/s +03-Dec 13:01:17 test185 0:(32,32,32) 0.0 sec 23: 11903 39.2% 511.2/s +03-Dec 13:01:17 test256 0:(32,32,32) 0.2 sec 38: 11865 39.4% 174.0/s +03-Dec 13:01:17 test256 0:(32,32,32) 0.0 sec : 11865 39.4% +03-Dec 13:01:17 test256 1:(32,32,64) 0.2 sec 1: 11864 39.4% 5.3/s +03-Dec 13:01:17 test256 1:(32,32,64) 0.0 sec : 11864 39.4% +03-Dec 13:01:18 test238b 0:(32,32,32) 0.7 sec 31: 11833 39.6% 44.5/s +03-Dec 13:01:18 test238 0:(32,32,32) 0.5 sec 64: 11769 39.9% 131.7/s +03-Dec 13:01:19 test186 0:(32,32,32) 0.9 sec 25: 11744 40.0% 27.5/s +03-Dec 13:01:19 test186 0:(32,32,32) 0.1 sec : 11744 40.0% +03-Dec 13:01:19 test186 0:(32,32,32) 0.1 sec : 11744 40.0% +[malloc debugging turned on] +03-Dec 13:01:21 testca(1) 0:(32,32,32) 1.7 sec 38: 11706 40.2% 21.8/s +03-Dec 13:01:23 testca(1) 0:(32,32,32) 1.7 sec 2: 11704 40.3% 1.2/s +03-Dec 13:01:23 test148 0:(32,32,32) 0.5 sec 7: 11697 40.3% 14.2/s +03-Dec 13:01:23 test148 0:(32,32,32) 0.0 sec 4: 11693 40.3% 1497.6/s +03-Dec 13:01:25 test231 0:(32,32,32) 1.7 sec 385: 11308 42.3% 231.3/s +03-Dec 13:01:26 test129 0:(32,32,32) 0.7 sec 10: 11298 42.3% 15.1/s +03-Dec 13:01:31 test69 0:(32,32,32) 5.2 
sec 31: 11267 42.5% 6.0/s +03-Dec 13:01:35 test69 0:(32,32,32) 4.0 sec 12: 11255 42.5% 3.0/s +03-Dec 13:01:37 test29 0:(32,32,32) 2.0 sec 151: 11104 43.3% 77.0/s +03-Dec 13:01:39 test29 0:(32,32,32) 1.9 sec 2: 11102 43.3% 1.1/s +03-Dec 13:01:40 test29 1:(32,32,64) 1.8 sec 2: 11100 43.3% 1.1/s +03-Dec 13:01:42 test29 1:(32,32,64) 1.8 sec : 11100 43.3% +03-Dec 13:01:43 test282 0:(32,32,32) 0.3 sec 15: 11085 43.4% 45.1/s +03-Dec 13:01:43 test249 0:(32,32,32) 0.4 sec 18: 11067 43.5% 44.6/s +03-Dec 13:01:43 test249 0:(32,32,32) 0.4 sec 1: 11066 43.5% 2.7/s +03-Dec 13:01:44 test196 0:(32,32,32) 0.5 sec 18: 11048 43.6% 33.6/s +03-Dec 13:01:44 test250 0:(32,32,32) 0.5 sec 69: 10979 44.0% 137.0/s +03-Dec 13:01:45 test250 0:(32,32,32) 0.6 sec 4: 10975 44.0% 6.2/s +03-Dec 13:01:46 test145 0:(32,32,32) 0.7 sec 22: 10953 44.1% 30.4/s +03-Dec 13:01:46 test145 0:(32,32,32) 0.0 sec 6: 10947 44.1% 1109.1/s +03-Dec 13:01:52 test229 0:(32,32,32) 6.6 sec 14: 10933 44.2% 2.1/s +03-Dec 13:01:54 test209 0:(32,32,32) 2.0 sec 43: 10890 44.4% 21.5/s +03-Dec 13:01:56 test209 1:(32,32,64) 2.0 sec 1: 10889 44.4% 0.5/s +03-Dec 13:01:58 test224 0:(32,32,32) 1.5 sec 60: 10829 44.7% 39.7/s +03-Dec 13:01:58 test191 0:(32,32,32) 0.4 sec 26: 10803 44.9% 61.5/s +03-Dec 13:01:58 test191 0:(32,32,32) 0.1 sec 2: 10801 44.9% 20.8/s +03-Dec 13:01:59 test150 0:(32,32,32) 0.0 sec 20: 10781 45.0% 553.7/s +03-Dec 13:01:59 test240 0:(32,32,32) 0.3 sec 26: 10755 45.1% 86.5/s +03-Dec 13:01:59 test240 0:(32,32,32) 0.4 sec 1: 10754 45.1% 2.8/s +03-Dec 13:02:00 test237 0:(32,32,32) 0.3 sec 10: 10744 45.2% 32.4/s +03-Dec 13:02:00 test237 0:(32,32,32) 0.0 sec 1: 10743 45.2% 103.3/s +03-Dec 13:02:00 test237 0:(32,32,32) 0.0 sec 1: 10742 45.2% 95.9/s +03-Dec 13:02:00 test237 0:(32,32,32) 0.0 sec 1: 10741 45.2% 110.8/s +03-Dec 13:02:01 test184 0:(32,32,32) 1.1 sec 7: 10734 45.2% 6.3/s +03-Dec 13:02:04 test236 0:(32,32,32) 3.4 sec 117: 10617 45.8% 33.9/s +[malloc debugging turned off] +03-Dec 13:02:15 test84 0:(32,32,32) 
10.8 sec 15: 10602 45.9% 1.4/s +03-Dec 13:02:15 test84 0:(32,32,32) 0.4 sec 32: 10570 46.0% 76.7/s +03-Dec 13:02:25 test84 2:(32,64,32) 9.3 sec 1: 10569 46.0% 0.1/s +03-Dec 13:02:25 test84 2:(32,64,32) 0.4 sec : 10569 46.0% +03-Dec 13:02:26 test84 0:(32,32,32) 0.5 sec : 10569 46.0% +03-Dec 13:02:26 test84 0:(32,32,32) 0.5 sec 4: 10565 46.1% 8.3/s +03-Dec 13:02:26 test84 2:(32,64,32) 0.5 sec 1: 10564 46.1% 2.2/s +03-Dec 13:02:27 test84 2:(32,64,32) 0.5 sec : 10564 46.1% +03-Dec 13:02:45 test173 0:(32,32,32) 17.6 sec 20: 10544 46.2% 1.1/s +03-Dec 13:02:45 test173 0:(32,32,32) 0.4 sec 4: 10540 46.2% 9.9/s +03-Dec 13:02:57 test230 0:(32,32,32) 11.8 sec 250: 10290 47.5% 21.3/s +03-Dec 13:02:58 test230 0:(32,32,32) 1.7 sec 2: 10288 47.5% 1.2/s +03-Dec 13:03:10 test18 0:(32,32,32) 12.0 sec 91: 10197 47.9% 7.6/s +03-Dec 13:03:13 test18 0:(32,32,32) 2.9 sec 7: 10190 48.0% 2.4/s +03-Dec 13:03:55 testc7(0) 0:(32,32,32) 41.5 sec 12: 10178 48.0% 0.3/s +03-Dec 13:04:01 testc7(0) 0:(32,32,32) 6.1 sec 11: 10167 48.1% 1.8/s +03-Dec 13:04:27 test193 0:(32,32,32) 25.9 sec 200: 9967 49.1% 7.7/s +03-Dec 13:04:30 test127 0:(32,32,32) 2.8 sec 929: 9038 53.9% 335.7/s +03-Dec 13:04:34 test23 0:(32,32,32) 4.5 sec 61: 8977 54.2% 13.7/s +03-Dec 13:04:41 test243 0:(32,32,32) 6.5 sec 7: 8970 54.2% 1.1/s +03-Dec 13:05:43 test53 0:(32,32,32) 61.8 sec 38: 8932 54.4% 0.6/s +03-Dec 13:05:47 test53 0:(32,32,32) 4.1 sec 5: 8927 54.4% 1.2/s +03-Dec 13:05:59 test242 0:(32,32,32) 12.4 sec 45: 8882 54.7% 3.6/s +03-Dec 13:06:10 test17 0:(32,32,32) 11.2 sec 32: 8850 54.8% 2.9/s +03-Dec 13:06:20 test246 0:(32,32,32) 9.8 sec 5: 8845 54.8% 0.5/s +03-Dec 13:06:25 test251b 0:(32,32,32) 5.3 sec 26: 8819 55.0% 4.9/s +03-Dec 13:06:36 test251 0:(32,32,32) 10.9 sec 100: 8719 55.5% 9.1/s +03-Dec 13:06:49 test152 0:(32,32,32) 12.5 sec 190: 8529 56.5% 15.2/s +03-Dec 13:07:03 test152 0:(32,32,32) 13.9 sec 123: 8406 57.1% 8.9/s +03-Dec 13:07:05 test160 0:(32,32,32) 2.6 sec 17: 8389 57.2% 6.6/s +03-Dec 13:07:41 test232 
0:(32,32,32) 35.5 sec 58: 8331 57.5% 1.6/s +03-Dec 13:07:41 test232 0:(32,32,32) 0.5 sec 5: 8326 57.5% 10.5/s +03-Dec 13:07:43 test142b 0:(32,32,32) 1.7 sec 10: 8316 57.5% 6.0/s +03-Dec 13:07:43 test142b 0:(32,32,32) 0.0 sec 3: 8313 57.6% 147.7/s +03-Dec 13:08:36 test142 0:(32,32,32) 53.2 sec 304: 8009 59.1% 5.7/s +03-Dec 13:09:46 test227 0:(32,32,32) 69.9 sec 15: 7994 59.2% 0.2/s +03-Dec 13:09:55 test292 0:(32,32,32) 8.2 sec 1: 7993 59.2% 0.1/s +03-Dec 13:09:56 test192 0:(32,32,32) 1.2 sec 3: 7990 59.2% 2.6/s +03-Dec 13:10:05 test181 0:(32,32,32) 8.8 sec 4: 7986 59.2% 0.5/s +03-Dec 13:10:09 test181 0:(32,32,32) 4.7 sec 11: 7975 59.3% 2.3/s +[malloc debugging turned on] +03-Dec 13:10:27 test130 0:(32,32,32) 17.3 sec 8: 7967 59.3% 0.5/s +03-Dec 13:10:27 test130 0:(32,32,32) 0.4 sec 5: 7962 59.4% 12.8/s +03-Dec 13:10:28 test206 0:(32,32,32) 0.7 sec 86: 7876 59.8% 124.6/s +03-Dec 13:10:33 test206 0:(32,32,32) 4.9 sec 12: 7864 59.9% 2.4/s +03-Dec 13:10:48 test02 0:(32,32,32) 15.5 sec 3: 7861 59.9% 0.2/s +03-Dec 13:11:40 test11 0:(32,32,32) 51.3 sec 17: 7844 60.0% 0.3/s +03-Dec 13:11:46 test187 0:(32,32,32) 5.5 sec 5: 7839 60.0% 0.9/s +03-Dec 13:11:48 test187 0:(32,32,32) 2.2 sec 1: 7838 60.0% 0.5/s +03-Dec 13:11:50 test169 0:(32,32,32) 2.7 sec 32: 7806 60.2% 12.0/s +03-Dec 13:11:54 test76 0:(32,32,32) 4.0 sec 14: 7792 60.2% 3.5/s +03-Dec 13:12:03 test01 0:(32,32,32) 8.4 sec 591: 7201 63.2% 70.0/s +03-Dec 13:12:08 test01 0:(32,32,32) 4.9 sec 4: 7197 63.3% 0.8/s +03-Dec 13:12:12 test228 0:(32,32,32) 4.1 sec 25: 7172 63.4% 6.1/s +03-Dec 13:12:17 test104 0:(32,32,32) 5.2 sec 35: 7137 63.6% 6.7/s +03-Dec 13:13:20 test284 0:(32,32,32) 63.3 sec 68: 7069 63.9% 1.1/s +03-Dec 13:13:23 test284 0:(32,32,32) 2.1 sec 4: 7065 63.9% 1.9/s +03-Dec 13:13:44 test180 0:(32,32,32) 21.2 sec 21: 7044 64.0% 1.0/s +03-Dec 13:13:51 test180 0:(32,32,32) 7.1 sec 91: 6953 64.5% 12.8/s +03-Dec 13:14:00 test188 0:(32,32,32) 8.6 sec 169: 6784 65.4% 19.7/s +03-Dec 13:14:29 test151b 0:(32,32,32) 28.9 
sec 34: 6750 65.5% 1.2/s +03-Dec 13:14:29 test151b 0:(32,32,32) 0.2 sec 18: 6732 65.6% 81.2/s +03-Dec 13:14:52 test14b 0:(32,32,32) 22.9 sec 95: 6637 66.1% 4.2/s +03-Dec 13:15:19 test14 0:(32,32,32) 27.1 sec 257: 6380 67.4% 9.5/s +[malloc debugging turned off] +03-Dec 13:15:50 test125 0:(32,32,32) 31.2 sec 319: 6061 69.1% 10.2/s +03-Dec 13:17:02 test10 0:(32,32,32) 72.0 sec 702: 5359 72.6% 9.8/s +03-Dec 13:17:39 test75b 0:(32,32,32) 37.0 sec 293: 5066 74.1% 7.9/s +03-Dec 13:18:09 test74 0:(32,32,32) 29.9 sec 4101: 965 95.1% 137.4/s +03-Dec 13:20:40 test234 0:(32,32,32) 150.9 sec 198: 767 96.1% 1.3/s +[malloc debugging turned on] +03-Dec 13:20:40 test154b 0:(32,32,32) 0.0 sec 12: 755 96.1% 506.2/s +03-Dec 13:22:44 test154 0:(32,32,32) 124.2 sec 657: 98 99.5% 5.3/s +03-Dec 13:23:58 test21b 0:(32,32,32) 73.1 sec 64: 34 99.8% 0.9/s +03-Dec 13:28:08 test19b 0:(32,32,32) 249.9 sec 20: 14 99.9% 0.1/s +03-Dec 13:28:10 test19b 0:(32,32,32) 1.6 sec 4: 10 99.9% 2.5/s +[malloc debugging turned off] +03-Dec 13:30:46 test19 0:(32,32,32) 155.8 sec 7: 3 99.9% 0.0/s +03-Dec 13:30:47 test19 0:(32,32,32) 1.0 sec 2: 1 99.9% 2.1/s +03-Dec 13:30:47 test280(0) 0:(32,32,32) 0.0 sec 1: all 100% 20.3/s +[malloc debugging turned off] +03-Dec-2025 13:30:47 grbcov ending diff --git a/GraphBLAS/Tcov/log_Nov5.txt b/GraphBLAS/Tcov/log_Nov5.txt new file mode 100644 index 0000000000..11a4cabb95 --- /dev/null +++ b/GraphBLAS/Tcov/log_Nov5.txt @@ -0,0 +1,332 @@ +05-Nov-2025 15:02:41 grbcov starting +total blocks: 19591 + +---------------------------------------------- [cover] +[malloc debugging turned on] +05-Nov 15:04:18 test300 0:(32,32,32) 0.0 sec 139: 19236 1.8% 9610.1/s +05-Nov 15:04:24 test301 0:(32,32,32) 5.8 sec 370: 18866 3.7% 63.3/s +05-Nov 15:04:29 test301 0:(32,32,32) 4.8 sec 38: 18828 3.9% 7.9/s +05-Nov 15:04:29 test302 0:(32,32,32) 0.0 sec 47: 18781 4.1% 3032.5/s +05-Nov 15:04:29 test155 0:(32,32,32) 0.0 sec 227: 18554 5.3% 5462.9/s +05-Nov 15:04:29 test155 0:(32,32,32) 0.0 sec 33: 
18521 5.5% 1378.2/s +05-Nov 15:04:29 test155 2:(32,64,32) 0.0 sec 9: 18512 5.5% 460.1/s +05-Nov 15:04:29 test155 2:(32,64,32) 0.0 sec : 18512 5.5% +05-Nov 15:04:29 test155 4:(64,32,32) 0.0 sec 3: 18509 5.5% 156.6/s +05-Nov 15:04:29 test155 4:(64,32,32) 0.0 sec : 18509 5.5% +[malloc debugging turned off] +05-Nov 15:04:29 test299 0:(32,32,32) 0.0 sec 13: 18496 5.6% 816.5/s +05-Nov 15:04:31 test298 0:(32,32,32) 1.6 sec 142: 18354 6.3% 86.7/s +05-Nov 15:04:31 test298 0:(32,32,32) 0.0 sec 38: 18316 6.5% 1968.5/s +05-Nov 15:04:31 test297 0:(32,32,32) 0.3 sec 129: 18187 7.2% 452.8/s +05-Nov 15:04:31 test295 0:(32,32,32) 0.0 sec 12: 18173 7.2% 752.1/s +05-Nov 15:04:31 test294 0:(32,32,32) 0.1 sec 14: 18159 7.3% 190.0/s +05-Nov 15:04:34 test293 0:(32,32,32) 3.2 sec 44: 18115 7.5% 13.8/s +05-Nov 15:04:34 test291 0:(32,32,32) 0.0 sec 12: 18103 7.6% 767.9/s +05-Nov 15:04:34 test291 1:(32,32,64) 0.0 sec 1: 18102 7.6% 3225.8/s +05-Nov 15:04:34 test291 2:(32,64,32) 0.0 sec 1: 18101 7.6% 3773.6/s +05-Nov 15:04:34 test291 4:(64,32,32) 0.0 sec 1: 18100 7.6% 3703.7/s +05-Nov 15:04:36 test290 0:(32,32,32) 1.7 sec 2: 18098 7.6% 1.2/s +05-Nov 15:04:36 test287 0:(32,32,32) 0.0 sec 29: 18069 7.8% 1400.2/s +05-Nov 15:04:36 test287 4:(64,32,32) 0.0 sec : 18069 7.8% +05-Nov 15:04:37 test286 0:(32,32,32) 0.8 sec 40: 18029 8.0% 49.4/s +05-Nov 15:04:37 test286 0:(32,32,32) 0.0 sec 9: 18020 8.0% 2328.0/s +05-Nov 15:04:38 test286 1:(32,32,64) 0.9 sec : 18020 8.0% +05-Nov 15:04:38 test286 1:(32,32,64) 0.0 sec 1: 18019 8.0% 854.7/s +05-Nov 15:04:39 test286 2:(32,64,32) 0.8 sec : 18019 8.0% +05-Nov 15:04:39 test286 2:(32,64,32) 0.0 sec : 18019 8.0% +05-Nov 15:04:40 test286 4:(64,32,32) 0.9 sec : 18019 8.0% +05-Nov 15:04:40 test286 4:(64,32,32) 0.0 sec : 18019 8.0% +05-Nov 15:04:40 test78 0:(32,32,32) 0.5 sec 23: 17996 8.1% 42.0/s +05-Nov 15:04:40 test78 0:(32,32,32) 0.0 sec : 17996 8.1% +05-Nov 15:04:40 test78 4:(64,32,32) 0.2 sec 1: 17995 8.1% 4.4/s +05-Nov 15:04:40 test78 4:(64,32,32) 0.0 sec : 
17995 8.1% +05-Nov 15:04:41 test285 0:(32,32,32) 0.7 sec 33: 17962 8.3% 47.0/s +05-Nov 15:04:41 test285 0:(32,32,32) 0.0 sec 8: 17954 8.4% 1215.1/s +05-Nov 15:04:41 test247 0:(32,32,32) 0.1 sec 22: 17932 8.5% 159.9/s +05-Nov 15:04:41 test247 0:(32,32,32) 0.1 sec 5: 17927 8.5% 49.9/s +05-Nov 15:04:42 test109 0:(32,32,32) 0.2 sec 31: 17896 8.7% 193.4/s +05-Nov 15:04:42 test109 0:(32,32,32) 0.0 sec 2: 17894 8.7% 1537.3/s +05-Nov 15:04:42 test109 0:(32,32,32) 0.1 sec 21: 17873 8.8% 177.1/s +05-Nov 15:04:42 test109 0:(32,32,32) 0.0 sec 1: 17872 8.8% 1023.5/s +05-Nov 15:04:42 test138 0:(32,32,32) 0.1 sec 27: 17845 8.9% 199.9/s +05-Nov 15:04:42 test138 0:(32,32,32) 0.0 sec 1: 17844 8.9% 45.0/s +05-Nov 15:04:42 test172 0:(32,32,32) 0.1 sec 41: 17802 9.1% 774.5/s +05-Nov 15:04:42 test172 0:(32,32,32) 0.0 sec 4: 17798 9.2% 119.2/s +05-Nov 15:04:42 test174 0:(32,32,32) 0.2 sec 19: 17779 9.2% 91.3/s +05-Nov 15:04:42 test174 0:(32,32,32) 0.0 sec 6: 17773 9.3% 1174.6/s +05-Nov 15:04:42 test203 0:(32,32,32) 0.0 sec 14: 17759 9.4% 747.5/s +05-Nov 15:04:42 test213 0:(32,32,32) 0.1 sec 10: 17749 9.4% 93.5/s +05-Nov 15:04:42 test213 0:(32,32,32) 0.0 sec 1: 17748 9.4% 723.6/s +05-Nov 15:04:42 test216 0:(32,32,32) 0.0 sec 11: 17737 9.5% 470.6/s +05-Nov 15:04:43 test225 0:(32,32,32) 0.3 sec 23: 17714 9.6% 85.7/s +05-Nov 15:04:43 test225 0:(32,32,32) 0.1 sec 29: 17685 9.7% 559.9/s +05-Nov 15:04:43 test226 0:(32,32,32) 0.2 sec 11: 17674 9.8% 71.0/s +05-Nov 15:04:43 test226 0:(32,32,32) 0.0 sec 2: 17672 9.8% 418.3/s +05-Nov 15:04:43 test235 0:(32,32,32) 0.0 sec 19: 17653 9.9% 393.4/s +05-Nov 15:04:43 test235 0:(32,32,32) 0.0 sec 3: 17650 9.9% 263.8/s +05-Nov 15:04:43 test252 0:(32,32,32) 0.0 sec 15: 17635 10.0% 951.2/s +05-Nov 15:04:43 test253 0:(32,32,32) 0.0 sec 30: 17605 10.1% 708.8/s +05-Nov 15:04:43 test255 0:(32,32,32) 0.0 sec 8: 17597 10.2% 471.6/s +05-Nov 15:04:44 test257 0:(32,32,32) 0.5 sec 51: 17546 10.4% 103.2/s +05-Nov 15:04:44 test257 0:(32,32,32) 0.0 sec 2: 17544 10.4% 
2617.8/s +05-Nov 15:04:44 test260 0:(32,32,32) 0.0 sec 3: 17541 10.5% 199.1/s +05-Nov 15:04:44 test261 0:(32,32,32) 0.0 sec 37: 17504 10.7% 2386.0/s +05-Nov 15:04:44 test262 0:(32,32,32) 0.0 sec 15: 17489 10.7% 1039.7/s +05-Nov 15:04:44 test263 0:(32,32,32) 0.3 sec 12: 17477 10.8% 46.8/s +05-Nov 15:04:44 test263 0:(32,32,32) 0.0 sec 1: 17476 10.8% 590.3/s +05-Nov 15:04:44 test264 0:(32,32,32) 0.0 sec 146: 17330 11.5% 9213.1/s +05-Nov 15:04:44 test265 0:(32,32,32) 0.5 sec 11: 17319 11.6% 22.1/s +05-Nov 15:04:44 test265 0:(32,32,32) 0.0 sec 1: 17318 11.6% 1200.5/s +05-Nov 15:04:45 test267 0:(32,32,32) 0.2 sec 16: 17302 11.7% 84.9/s +05-Nov 15:04:45 test269 0:(32,32,32) 0.0 sec 120: 17182 12.3% 6208.0/s +05-Nov 15:04:45 test271 0:(32,32,32) 0.0 sec 539: 16643 15.0% 16773.5/s +05-Nov 15:04:45 test272 0:(32,32,32) 0.0 sec 8: 16635 15.1% 521.6/s +05-Nov 15:04:45 test273 0:(32,32,32) 0.0 sec 122: 16513 15.7% 2473.3/s +05-Nov 15:04:45 test274 0:(32,32,32) 0.0 sec 119: 16394 16.3% 6064.0/s +05-Nov 15:04:45 test276 0:(32,32,32) 0.2 sec 1697: 14697 25.0% 7648.5/s +05-Nov 15:04:45 test277 0:(32,32,32) 0.0 sec 26: 14671 25.1% 1663.4/s +05-Nov 15:04:45 test279 0:(32,32,32) 0.0 sec 69: 14602 25.5% 3642.9/s +05-Nov 15:04:45 test281 0:(32,32,32) 0.0 sec 14: 14588 25.5% 862.8/s +05-Nov 15:04:45 test268 0:(32,32,32) 0.2 sec 4: 14584 25.6% 20.1/s +05-Nov 15:04:45 test268 0:(32,32,32) 0.0 sec 4: 14580 25.6% 807.3/s +05-Nov 15:04:45 test207 0:(32,32,32) 0.0 sec 7: 14573 25.6% 3634.5/s +05-Nov 15:04:45 test207 1:(32,32,64) 0.0 sec 1: 14572 25.6% 1477.1/s +05-Nov 15:04:46 test211 0:(32,32,32) 0.4 sec 14: 14558 25.7% 33.9/s +05-Nov 15:04:46 test211 0:(32,32,32) 0.0 sec 6: 14552 25.7% 935.9/s +05-Nov 15:04:46 test183 0:(32,32,32) 0.2 sec 16: 14536 25.8% 102.6/s +05-Nov 15:04:46 test212 0:(32,32,32) 0.2 sec 9: 14527 25.8% 36.1/s +05-Nov 15:04:46 test212 0:(32,32,32) 0.0 sec 5: 14522 25.9% 1748.9/s +05-Nov 15:04:46 test219 0:(32,32,32) 0.0 sec 8: 14514 25.9% 467.6/s +05-Nov 15:04:46 test219 
0:(32,32,32) 0.0 sec 4: 14510 25.9% 5442.2/s +[malloc debugging turned on] +05-Nov 15:04:46 test296 0:(32,32,32) 0.0 sec 23: 14487 26.1% 8505.9/s +05-Nov 15:04:46 test289 0:(32,32,32) 0.2 sec 76: 14411 26.4% 353.8/s +05-Nov 15:04:46 test288 0:(32,32,32) 0.0 sec 20: 14391 26.5% 1078.4/s +05-Nov 15:04:46 test244 0:(32,32,32) 0.1 sec 23: 14368 26.7% 379.5/s +05-Nov 15:04:46 test244 1:(32,32,64) 0.0 sec 2: 14366 26.7% 63.2/s +05-Nov 15:04:47 test194 0:(32,32,32) 0.1 sec 24: 14342 26.8% 243.9/s +05-Nov 15:04:47 test09 0:(32,32,32) 0.2 sec 9: 14333 26.8% 37.8/s +05-Nov 15:04:47 test09 0:(32,32,32) 0.0 sec 4: 14329 26.9% 2326.9/s +05-Nov 15:04:47 test108 0:(32,32,32) 0.2 sec 51: 14278 27.1% 297.1/s +05-Nov 15:04:47 test108 0:(32,32,32) 0.0 sec 25: 14253 27.2% 628.6/s +05-Nov 15:04:47 test137 0:(32,32,32) 0.1 sec 13: 14240 27.3% 118.9/s +05-Nov 15:04:47 test137 0:(32,32,32) 0.0 sec 1: 14239 27.3% 367.8/s +05-Nov 15:04:47 test137 0:(32,32,32) 0.0 sec 1: 14238 27.3% 267.1/s +05-Nov 15:04:47 test124 0:(32,32,32) 0.1 sec 6: 14232 27.4% 40.3/s +05-Nov 15:04:51 test133 0:(32,32,32) 4.1 sec 10: 14222 27.4% 2.5/s +05-Nov 15:04:56 test133 0:(32,32,32) 4.5 sec 21: 14201 27.5% 4.6/s +05-Nov 15:04:58 test176 0:(32,32,32) 1.6 sec 26: 14175 27.6% 16.0/s +05-Nov 15:04:58 test176 0:(32,32,32) 0.1 sec 10: 14165 27.7% 153.1/s +05-Nov 15:04:58 test197 0:(32,32,32) 0.2 sec 12: 14153 27.8% 60.9/s +05-Nov 15:04:58 test197 0:(32,32,32) 0.2 sec 3: 14150 27.8% 15.0/s +05-Nov 15:04:58 test201 0:(32,32,32) 0.0 sec 23: 14127 27.9% 686.1/s +05-Nov 15:04:58 test208 0:(32,32,32) 0.0 sec 16: 14111 28.0% 350.0/s +05-Nov 15:04:58 test214 0:(32,32,32) 0.0 sec 39: 14072 28.2% 10918.3/s +05-Nov 15:04:58 test214 0:(32,32,32) 0.0 sec 5: 14067 28.2% 1812.3/s +05-Nov 15:04:58 test214 1:(32,32,64) 0.0 sec 1: 14066 28.2% 355.5/s +05-Nov 15:04:58 test214 1:(32,32,64) 0.0 sec : 14066 28.2% +05-Nov 15:04:58 test223 0:(32,32,32) 0.0 sec 11: 14055 28.3% 1320.7/s +05-Nov 15:04:58 test223 0:(32,32,32) 0.0 sec 1: 14054 
28.3% 59.6/s +05-Nov 15:04:58 test241 0:(32,32,32) 0.1 sec 54: 14000 28.5% 992.8/s +05-Nov 15:04:58 test241 0:(32,32,32) 0.0 sec 8: 13992 28.6% 229.4/s +05-Nov 15:04:58 test270 0:(32,32,32) 0.0 sec 288: 13704 30.0% 13072.5/s +05-Nov 15:04:58 test199 0:(32,32,32) 0.0 sec 16: 13688 30.1% 5873.7/s +05-Nov 15:04:58 test210 0:(32,32,32) 0.1 sec : 13688 30.1% +05-Nov 15:04:58 test210 0:(32,32,32) 0.0 sec 3: 13685 30.1% 3456.2/s +05-Nov 15:04:58 test165 0:(32,32,32) 0.0 sec 13: 13672 30.2% 6238.0/s +05-Nov 15:04:59 test221 0:(32,32,32) 0.1 sec 6: 13666 30.2% 69.1/s +05-Nov 15:04:59 test221 0:(32,32,32) 0.0 sec 1: 13665 30.2% 759.3/s +05-Nov 15:04:59 test278 0:(32,32,32) 0.0 sec 53: 13612 30.5% 3070.7/s +05-Nov 15:04:59 test162 0:(32,32,32) 0.0 sec 20: 13592 30.6% 1079.9/s +05-Nov 15:04:59 test162 0:(32,32,32) 0.0 sec 1: 13591 30.6% 37.0/s +05-Nov 15:04:59 test275 0:(32,32,32) 0.0 sec 27: 13564 30.8% 1074.4/s +05-Nov 15:04:59 test220 0:(32,32,32) 0.2 sec 9: 13555 30.8% 47.6/s +05-Nov 15:04:59 test83 0:(32,32,32) 0.3 sec 7: 13548 30.8% 26.7/s +05-Nov 15:04:59 test83 0:(32,32,32) 0.0 sec 1: 13547 30.9% 1040.6/s +05-Nov 15:05:00 test04 0:(32,32,32) 0.6 sec 12: 13535 30.9% 19.1/s +05-Nov 15:05:00 test04 0:(32,32,32) 0.0 sec 8: 13527 31.0% 1208.6/s +05-Nov 15:05:00 test132 0:(32,32,32) 0.0 sec 9: 13518 31.0% 351.2/s +05-Nov 15:05:00 test82 0:(32,32,32) 0.4 sec 13: 13505 31.1% 35.9/s +05-Nov 15:05:00 test202 0:(32,32,32) 0.0 sec 71: 13434 31.4% 4118.6/s +05-Nov 15:05:00 test202 0:(32,32,32) 0.0 sec 2: 13432 31.4% 169.5/s +05-Nov 15:05:00 test202 0:(32,32,32) 0.0 sec 4: 13428 31.5% 365.8/s +05-Nov 15:05:00 test202 1:(32,32,64) 0.0 sec : 13428 31.5% +05-Nov 15:05:00 test202 1:(32,32,64) 0.0 sec 1: 13427 31.5% 83.3/s +05-Nov 15:05:00 test202 1:(32,32,64) 0.0 sec : 13427 31.5% +05-Nov 15:05:00 test202 2:(32,64,32) 0.0 sec 1: 13426 31.5% 103.2/s +05-Nov 15:05:00 test202 2:(32,64,32) 0.0 sec : 13426 31.5% +05-Nov 15:05:00 test202 2:(32,64,32) 0.0 sec : 13426 31.5% +05-Nov 15:05:00 
test222 0:(32,32,32) 0.0 sec 40: 13386 31.7% 1031.8/s +05-Nov 15:05:00 test204 0:(32,32,32) 0.0 sec 9: 13377 31.7% 542.8/s +05-Nov 15:05:01 test258 0:(32,32,32) 0.2 sec 13: 13364 31.8% 62.2/s +05-Nov 15:05:01 test258 0:(32,32,32) 0.0 sec 2: 13362 31.8% 834.0/s +05-Nov 15:05:01 test258 1:(32,32,64) 0.1 sec 1: 13361 31.8% 17.3/s +05-Nov 15:05:01 test258 1:(32,32,64) 0.0 sec : 13361 31.8% +05-Nov 15:05:02 test136 0:(32,32,32) 1.0 sec 7: 13354 31.8% 7.1/s +05-Nov 15:05:02 test136 0:(32,32,32) 0.0 sec 53: 13301 32.1% 2996.6/s +05-Nov 15:05:02 test128 0:(32,32,32) 0.8 sec 52: 13249 32.4% 65.7/s +05-Nov 15:05:03 test128 0:(32,32,32) 0.1 sec 1: 13248 32.4% 8.5/s +05-Nov 15:05:03 test144 0:(32,32,32) 0.2 sec 5: 13243 32.4% 23.0/s +05-Nov 15:05:05 test81 0:(32,32,32) 2.3 sec 39: 13204 32.6% 16.7/s +[malloc debugging turned off] +05-Nov 15:05:06 testc2(0,0) 0:(32,32,32) 0.4 sec 172: 13032 33.5% 454.9/s +05-Nov 15:05:06 testc2(0,0) 1:(32,32,64) 0.3 sec 1: 13031 33.5% 3.6/s +05-Nov 15:05:06 test239 0:(32,32,32) 0.0 sec 10: 13021 33.5% 1207.1/s +05-Nov 15:05:06 test239 0:(32,32,32) 0.3 sec 1: 13020 33.5% 2.9/s +05-Nov 15:05:07 test245 0:(32,32,32) 0.3 sec 33: 12987 33.7% 102.8/s +05-Nov 15:05:07 test245 0:(32,32,32) 0.0 sec 8: 12979 33.8% 1675.7/s +05-Nov 15:05:07 test159 0:(32,32,32) 0.3 sec 39: 12940 33.9% 127.7/s +05-Nov 15:05:08 test259 0:(32,32,32) 0.9 sec 27: 12913 34.1% 30.0/s +05-Nov 15:05:08 test259 0:(32,32,32) 0.0 sec 1: 12912 34.1% 181.8/s +05-Nov 15:05:09 testc4(0) 0:(32,32,32) 1.0 sec 11: 12901 34.1% 10.9/s +05-Nov 15:05:09 test157 0:(32,32,32) 0.6 sec 29: 12872 34.3% 48.4/s +05-Nov 15:05:11 test182 0:(32,32,32) 1.9 sec 22: 12850 34.4% 11.6/s +05-Nov 15:05:12 test182 0:(32,32,32) 0.3 sec 13: 12837 34.5% 51.6/s +05-Nov 15:05:13 test195 0:(32,32,32) 2.0 sec 62: 12775 34.8% 31.7/s +05-Nov 15:05:17 test135 0:(32,32,32) 3.7 sec 43: 12732 35.0% 11.7/s +05-Nov 15:05:17 test215 0:(32,32,32) 0.1 sec 2: 12730 35.0% 22.8/s +05-Nov 15:05:19 test80 0:(32,32,32) 2.2 sec 11: 
12719 35.1% 4.9/s +05-Nov 15:05:20 test200 0:(32,32,32) 0.7 sec 11: 12708 35.1% 15.9/s +05-Nov 15:05:22 test283 0:(32,32,32) 2.1 sec 94: 12614 35.6% 44.5/s +05-Nov 15:05:24 test283 1:(32,32,64) 1.5 sec 1: 12613 35.6% 0.7/s +05-Nov 15:05:24 test254 0:(32,32,32) 0.3 sec 25: 12588 35.7% 91.0/s +05-Nov 15:05:25 test254 0:(32,32,32) 1.1 sec 1: 12587 35.8% 0.9/s +05-Nov 15:05:28 test54 0:(32,32,32) 3.1 sec 26: 12560 35.9% 8.5/s +05-Nov 15:05:30 test54 0:(32,32,32) 1.3 sec 13: 12547 36.0% 10.4/s +05-Nov 15:05:32 testcc(1) 0:(32,32,32) 2.2 sec 10: 12537 36.0% 4.6/s +05-Nov 15:05:33 testcc(1) 0:(32,32,32) 1.4 sec 6: 12531 36.0% 4.3/s +05-Nov 15:05:33 testc2(1,1) 0:(32,32,32) 0.3 sec 11: 12520 36.1% 37.5/s +05-Nov 15:05:36 testc2(1,1) 0:(32,32,32) 2.7 sec 3: 12517 36.1% 1.1/s +05-Nov 15:05:38 test141 0:(32,32,32) 1.4 sec 521: 11996 38.8% 371.1/s +05-Nov 15:05:38 test179 0:(32,32,32) 0.0 sec 22: 11974 38.9% 751.1/s +05-Nov 15:05:38 test179 0:(32,32,32) 0.7 sec 10: 11964 38.9% 14.6/s +05-Nov 15:05:38 test188b 0:(32,32,32) 0.0 sec 39: 11925 39.1% 1474.9/s +05-Nov 15:05:38 test185 0:(32,32,32) 0.0 sec 23: 11902 39.2% 554.0/s +05-Nov 15:05:39 test256 0:(32,32,32) 0.2 sec 38: 11864 39.4% 217.6/s +05-Nov 15:05:39 test256 0:(32,32,32) 0.0 sec : 11864 39.4% +05-Nov 15:05:39 test256 1:(32,32,64) 0.2 sec 1: 11863 39.4% 5.6/s +05-Nov 15:05:39 test256 1:(32,32,64) 0.0 sec : 11863 39.4% +05-Nov 15:05:39 test238b 0:(32,32,32) 0.8 sec 31: 11832 39.6% 41.0/s +05-Nov 15:05:40 test238 0:(32,32,32) 0.5 sec 64: 11768 39.9% 133.0/s +05-Nov 15:05:41 test186 0:(32,32,32) 0.9 sec 25: 11743 40.1% 27.6/s +05-Nov 15:05:41 test186 0:(32,32,32) 0.1 sec : 11743 40.1% +05-Nov 15:05:41 test186 0:(32,32,32) 0.1 sec : 11743 40.1% +[malloc debugging turned on] +05-Nov 15:05:43 testca(1) 0:(32,32,32) 1.7 sec 38: 11705 40.3% 22.6/s +05-Nov 15:05:44 testca(1) 0:(32,32,32) 1.7 sec 2: 11703 40.3% 1.2/s +05-Nov 15:05:45 test148 0:(32,32,32) 0.4 sec 7: 11696 40.3% 15.9/s +05-Nov 15:05:45 test148 0:(32,32,32) 0.0 sec 
4: 11692 40.3% 1431.1/s +05-Nov 15:05:46 test231 0:(32,32,32) 1.6 sec 385: 11307 42.3% 246.8/s +05-Nov 15:05:47 test129 0:(32,32,32) 0.7 sec 10: 11297 42.3% 15.1/s +05-Nov 15:05:52 test69 0:(32,32,32) 4.9 sec 31: 11266 42.5% 6.3/s +05-Nov 15:05:56 test69 0:(32,32,32) 3.7 sec 12: 11254 42.6% 3.2/s +05-Nov 15:05:58 test29 0:(32,32,32) 2.0 sec 151: 11103 43.3% 77.3/s +05-Nov 15:06:00 test29 0:(32,32,32) 1.9 sec 2: 11101 43.3% 1.1/s +05-Nov 15:06:02 test29 1:(32,32,64) 1.9 sec 2: 11099 43.3% 1.1/s +05-Nov 15:06:03 test29 1:(32,32,64) 1.9 sec : 11099 43.3% +05-Nov 15:06:04 test282 0:(32,32,32) 0.3 sec 15: 11084 43.4% 46.1/s +05-Nov 15:06:04 test249 0:(32,32,32) 0.3 sec 19: 11065 43.5% 56.2/s +05-Nov 15:06:04 test249 0:(32,32,32) 0.3 sec 1: 11064 43.5% 3.1/s +05-Nov 15:06:05 test196 0:(32,32,32) 0.9 sec 18: 11046 43.6% 20.4/s +05-Nov 15:06:06 test250 0:(32,32,32) 0.5 sec 69: 10977 44.0% 141.2/s +05-Nov 15:06:07 test250 0:(32,32,32) 0.7 sec 4: 10973 44.0% 5.4/s +05-Nov 15:06:07 test145 0:(32,32,32) 0.7 sec 22: 10951 44.1% 31.3/s +05-Nov 15:06:07 test145 0:(32,32,32) 0.0 sec 6: 10945 44.1% 628.2/s +05-Nov 15:06:14 test229 0:(32,32,32) 6.6 sec 14: 10931 44.2% 2.1/s +05-Nov 15:06:16 test209 0:(32,32,32) 2.0 sec 44: 10887 44.4% 22.2/s +05-Nov 15:06:18 test209 1:(32,32,64) 2.1 sec 1: 10886 44.4% 0.5/s +05-Nov 15:06:20 test224 0:(32,32,32) 1.8 sec 59: 10827 44.7% 33.2/s +05-Nov 15:06:20 test191 0:(32,32,32) 0.4 sec 26: 10801 44.9% 60.2/s +05-Nov 15:06:20 test191 0:(32,32,32) 0.1 sec 2: 10799 44.9% 20.1/s +05-Nov 15:06:20 test150 0:(32,32,32) 0.0 sec 20: 10779 45.0% 544.1/s +05-Nov 15:06:21 test240 0:(32,32,32) 0.3 sec 26: 10753 45.1% 89.7/s +05-Nov 15:06:21 test240 0:(32,32,32) 0.4 sec 1: 10752 45.1% 2.8/s +05-Nov 15:06:21 test237 0:(32,32,32) 0.3 sec 10: 10742 45.2% 31.8/s +05-Nov 15:06:21 test237 0:(32,32,32) 0.0 sec 1: 10741 45.2% 101.2/s +05-Nov 15:06:21 test237 0:(32,32,32) 0.0 sec 1: 10740 45.2% 94.1/s +05-Nov 15:06:21 test237 0:(32,32,32) 0.0 sec 1: 10739 45.2% 106.1/s 
+05-Nov 15:06:22 test184 0:(32,32,32) 1.1 sec 7: 10732 45.2% 6.5/s +05-Nov 15:06:26 test236 0:(32,32,32) 3.3 sec 117: 10615 45.8% 35.2/s +[malloc debugging turned off] +05-Nov 15:06:36 test84 0:(32,32,32) 10.6 sec 15: 10600 45.9% 1.4/s +05-Nov 15:06:37 test84 0:(32,32,32) 0.4 sec 32: 10568 46.1% 77.4/s +05-Nov 15:06:46 test84 2:(32,64,32) 9.0 sec 1: 10567 46.1% 0.1/s +05-Nov 15:06:46 test84 2:(32,64,32) 0.4 sec : 10567 46.1% +05-Nov 15:06:47 test84 0:(32,32,32) 0.5 sec : 10567 46.1% +05-Nov 15:06:47 test84 0:(32,32,32) 0.5 sec 4: 10563 46.1% 8.3/s +05-Nov 15:06:48 test84 2:(32,64,32) 0.5 sec 1: 10562 46.1% 2.2/s +05-Nov 15:06:48 test84 2:(32,64,32) 0.5 sec : 10562 46.1% +05-Nov 15:07:06 test173 0:(32,32,32) 17.5 sec 20: 10542 46.2% 1.1/s +05-Nov 15:07:06 test173 0:(32,32,32) 0.4 sec 4: 10538 46.2% 10.0/s +05-Nov 15:07:18 test230 0:(32,32,32) 11.6 sec 250: 10288 47.5% 21.6/s +05-Nov 15:07:19 test230 0:(32,32,32) 1.7 sec 2: 10286 47.5% 1.2/s +05-Nov 15:07:31 test18 0:(32,32,32) 12.1 sec 91: 10195 48.0% 7.5/s +05-Nov 15:07:34 test18 0:(32,32,32) 2.9 sec 7: 10188 48.0% 2.4/s +05-Nov 15:08:16 testc7(0) 0:(32,32,32) 41.3 sec 12: 10176 48.1% 0.3/s +05-Nov 15:08:22 testc7(0) 0:(32,32,32) 6.0 sec 11: 10165 48.1% 1.8/s +05-Nov 15:08:48 test193 0:(32,32,32) 25.9 sec 200: 9965 49.1% 7.7/s +05-Nov 15:08:51 test127 0:(32,32,32) 2.8 sec 929: 9036 53.9% 326.7/s +05-Nov 15:08:55 test23 0:(32,32,32) 4.5 sec 61: 8975 54.2% 13.5/s +05-Nov 15:09:02 test243 0:(32,32,32) 6.6 sec 7: 8968 54.2% 1.1/s +05-Nov 15:10:02 test53 0:(32,32,32) 60.5 sec 38: 8930 54.4% 0.6/s +05-Nov 15:10:06 test53 0:(32,32,32) 4.1 sec 5: 8925 54.4% 1.2/s +05-Nov 15:10:19 test242 0:(32,32,32) 12.3 sec 45: 8880 54.7% 3.7/s +05-Nov 15:10:30 test17 0:(32,32,32) 11.1 sec 32: 8848 54.8% 2.9/s +05-Nov 15:10:41 test246 0:(32,32,32) 10.7 sec 5: 8843 54.9% 0.5/s +05-Nov 15:10:45 test251b 0:(32,32,32) 5.0 sec 26: 8817 55.0% 5.2/s +05-Nov 15:10:56 test251 0:(32,32,32) 10.6 sec 100: 8717 55.5% 9.4/s +05-Nov 15:11:08 test152 
0:(32,32,32) 12.3 sec 190: 8527 56.5% 15.4/s +05-Nov 15:11:22 test152 0:(32,32,32) 13.6 sec 123: 8404 57.1% 9.1/s +05-Nov 15:11:25 test160 0:(32,32,32) 2.5 sec 17: 8387 57.2% 6.7/s +05-Nov 15:12:00 test232 0:(32,32,32) 35.5 sec 58: 8329 57.5% 1.6/s +05-Nov 15:12:01 test232 0:(32,32,32) 0.5 sec 5: 8324 57.5% 10.4/s +05-Nov 15:12:02 test142b 0:(32,32,32) 1.6 sec 10: 8314 57.6% 6.3/s +05-Nov 15:12:02 test142b 0:(32,32,32) 0.0 sec 3: 8311 57.6% 206.9/s +05-Nov 15:12:55 test142 0:(32,32,32) 53.1 sec 304: 8007 59.1% 5.7/s +05-Nov 15:14:03 test227 0:(32,32,32) 67.7 sec 15: 7992 59.2% 0.2/s +05-Nov 15:14:12 test292 0:(32,32,32) 8.2 sec 1: 7991 59.2% 0.1/s +05-Nov 15:14:13 test192 0:(32,32,32) 1.2 sec 3: 7988 59.2% 2.5/s +05-Nov 15:14:22 test181 0:(32,32,32) 8.6 sec 4: 7984 59.2% 0.5/s +05-Nov 15:14:26 test181 0:(32,32,32) 4.8 sec 11: 7973 59.3% 2.3/s +[malloc debugging turned on] +05-Nov 15:14:44 test130 0:(32,32,32) 17.3 sec 8: 7965 59.3% 0.5/s +05-Nov 15:14:44 test130 0:(32,32,32) 0.4 sec 5: 7960 59.4% 12.7/s +05-Nov 15:14:45 test206 0:(32,32,32) 0.7 sec 86: 7874 59.8% 127.3/s +05-Nov 15:14:50 test206 0:(32,32,32) 4.8 sec 12: 7862 59.9% 2.5/s +05-Nov 15:15:04 test02 0:(32,32,32) 14.7 sec 3: 7859 59.9% 0.2/s +05-Nov 15:15:54 test11 0:(32,32,32) 49.9 sec 17: 7842 60.0% 0.3/s +05-Nov 15:16:00 test187 0:(32,32,32) 5.6 sec 5: 7837 60.0% 0.9/s +05-Nov 15:16:03 test187 0:(32,32,32) 2.3 sec 1: 7836 60.0% 0.4/s +05-Nov 15:16:05 test169 0:(32,32,32) 2.6 sec 32: 7804 60.2% 12.2/s +05-Nov 15:16:09 test76 0:(32,32,32) 4.1 sec 14: 7790 60.2% 3.4/s +05-Nov 15:16:17 test01 0:(32,32,32) 8.2 sec 590: 7200 63.2% 71.6/s +05-Nov 15:16:22 test01 0:(32,32,32) 4.8 sec 4: 7196 63.3% 0.8/s +05-Nov 15:16:27 test228 0:(32,32,32) 4.1 sec 25: 7171 63.4% 6.0/s +05-Nov 15:16:33 test104 0:(32,32,32) 6.3 sec 35: 7136 63.6% 5.6/s +05-Nov 15:17:36 test284 0:(32,32,32) 63.3 sec 68: 7068 63.9% 1.1/s +05-Nov 15:17:38 test284 0:(32,32,32) 2.1 sec 4: 7064 63.9% 1.9/s +05-Nov 15:17:58 test180 0:(32,32,32) 20.1 
sec 21: 7043 64.0% 1.0/s +05-Nov 15:18:05 test180 0:(32,32,32) 6.3 sec 91: 6952 64.5% 14.5/s +05-Nov 15:18:13 test188 0:(32,32,32) 8.4 sec 169: 6783 65.4% 20.0/s +05-Nov 15:18:42 test151b 0:(32,32,32) 28.8 sec 34: 6749 65.6% 1.2/s +05-Nov 15:18:42 test151b 0:(32,32,32) 0.2 sec 18: 6731 65.6% 82.5/s +05-Nov 15:19:05 test14b 0:(32,32,32) 22.4 sec 95: 6636 66.1% 4.2/s +05-Nov 15:19:32 test14 0:(32,32,32) 27.0 sec 257: 6379 67.4% 9.5/s +[malloc debugging turned off] +05-Nov 15:20:03 test125 0:(32,32,32) 30.8 sec 319: 6060 69.1% 10.4/s +05-Nov 15:21:14 test10 0:(32,32,32) 71.4 sec 702: 5358 72.7% 9.8/s +05-Nov 15:21:51 test75b 0:(32,32,32) 36.7 sec 293: 5065 74.1% 8.0/s +05-Nov 15:22:20 test74 0:(32,32,32) 29.5 sec 4101: 964 95.1% 139.1/s +05-Nov 15:24:52 test234 0:(32,32,32) 151.6 sec 198: 766 96.1% 1.3/s +[malloc debugging turned on] +05-Nov 15:24:52 test154b 0:(32,32,32) 0.0 sec 11: 755 96.1% 575.0/s +05-Nov 15:26:53 test154 0:(32,32,32) 121.0 sec 657: 98 99.5% 5.4/s +05-Nov 15:28:03 test21b 0:(32,32,32) 69.7 sec 64: 34 99.8% 0.9/s +05-Nov 15:32:10 test19b 0:(32,32,32) 246.5 sec 20: 14 99.9% 0.1/s +05-Nov 15:32:12 test19b 0:(32,32,32) 1.6 sec 4: 10 99.9% 2.6/s +[malloc debugging turned off] +05-Nov 15:34:47 test19 0:(32,32,32) 154.5 sec 7: 3 99.9% 0.0/s +05-Nov 15:34:48 test19 0:(32,32,32) 0.9 sec 2: 1 99.9% 2.2/s +05-Nov 15:34:48 test280(0) 0:(32,32,32) 0.0 sec 1: all 100% 28.5/s +[malloc debugging turned off] +05-Nov-2025 15:34:48 grbcov ending diff --git a/GraphBLAS/Tcov/log_Oct31.txt b/GraphBLAS/Tcov/log_Oct31.txt deleted file mode 100644 index 1606254511..0000000000 --- a/GraphBLAS/Tcov/log_Oct31.txt +++ /dev/null @@ -1,332 +0,0 @@ -31-Oct-2025 15:02:32 grbcov starting -total blocks: 19585 - ----------------------------------------------- [cover] -[malloc debugging turned on] -31-Oct 15:04:15 test300 0:(32,32,32) 0.0 sec 139: 19232 1.8% 8984.0/s -31-Oct 15:04:21 test301 0:(32,32,32) 5.7 sec 365: 18867 3.7% 63.9/s -31-Oct 15:04:25 test301 0:(32,32,32) 4.7 sec 38: 
18829 3.9% 8.1/s -31-Oct 15:04:26 test302 0:(32,32,32) 0.0 sec 47: 18782 4.1% 2803.0/s -[malloc debugging turned off] -31-Oct 15:04:26 test299 0:(32,32,32) 0.0 sec 62: 18720 4.4% 4009.3/s -31-Oct 15:04:27 test298 0:(32,32,32) 1.6 sec 196: 18524 5.4% 123.4/s -31-Oct 15:04:27 test298 0:(32,32,32) 0.0 sec 58: 18466 5.7% 2898.6/s -31-Oct 15:04:27 test297 0:(32,32,32) 0.3 sec 129: 18337 6.4% 459.5/s -31-Oct 15:04:27 test295 0:(32,32,32) 0.0 sec 12: 18323 6.4% 689.0/s -31-Oct 15:04:28 test294 0:(32,32,32) 0.1 sec 15: 18308 6.5% 185.4/s -31-Oct 15:04:31 test293 0:(32,32,32) 3.3 sec 46: 18262 6.8% 14.1/s -31-Oct 15:04:31 test291 0:(32,32,32) 0.0 sec 12: 18250 6.8% 697.1/s -31-Oct 15:04:31 test291 1:(32,32,64) 0.0 sec 1: 18247 6.8% 3496.5/s -31-Oct 15:04:31 test291 2:(32,64,32) 0.0 sec 4: 18243 6.9% 17699.1/s -31-Oct 15:04:31 test291 4:(64,32,32) 0.0 sec 1: 18242 6.9% 4629.6/s -31-Oct 15:04:33 test290 0:(32,32,32) 1.7 sec 2: 18240 6.9% 1.2/s -31-Oct 15:04:33 test287 0:(32,32,32) 0.0 sec 33: 18207 7.0% 1433.2/s -31-Oct 15:04:33 test287 4:(64,32,32) 0.0 sec : 18207 7.0% -31-Oct 15:04:33 test286 0:(32,32,32) 0.7 sec 40: 18167 7.2% 54.8/s -31-Oct 15:04:33 test286 0:(32,32,32) 0.0 sec 9: 18158 7.3% 2999.0/s -31-Oct 15:04:34 test286 1:(32,32,64) 0.8 sec : 18158 7.3% -31-Oct 15:04:34 test286 1:(32,32,64) 0.0 sec 1: 18157 7.3% 848.9/s -31-Oct 15:04:35 test286 2:(32,64,32) 0.7 sec : 18157 7.3% -31-Oct 15:04:35 test286 2:(32,64,32) 0.0 sec : 18157 7.3% -31-Oct 15:04:36 test286 4:(64,32,32) 0.8 sec 1: 18156 7.3% 1.2/s -31-Oct 15:04:36 test286 4:(64,32,32) 0.0 sec : 18156 7.3% -31-Oct 15:04:36 test78 0:(32,32,32) 0.5 sec 24: 18132 7.4% 48.1/s -31-Oct 15:04:36 test78 0:(32,32,32) 0.0 sec : 18132 7.4% -31-Oct 15:04:36 test78 4:(64,32,32) 0.2 sec 1: 18131 7.4% 4.3/s -31-Oct 15:04:36 test78 4:(64,32,32) 0.0 sec : 18131 7.4% -31-Oct 15:04:37 test285 0:(32,32,32) 0.7 sec 33: 18098 7.6% 49.1/s -31-Oct 15:04:37 test285 0:(32,32,32) 0.0 sec 8: 18090 7.6% 1223.2/s -31-Oct 15:04:37 test247 
0:(32,32,32) 0.1 sec 22: 18068 7.7% 178.4/s -31-Oct 15:04:37 test247 0:(32,32,32) 0.1 sec 5: 18063 7.8% 52.0/s -31-Oct 15:04:38 test109 0:(32,32,32) 0.2 sec 31: 18032 7.9% 194.2/s -31-Oct 15:04:38 test109 0:(32,32,32) 0.0 sec 2: 18030 7.9% 1596.2/s -31-Oct 15:04:38 test109 0:(32,32,32) 0.1 sec 21: 18009 8.0% 190.3/s -31-Oct 15:04:38 test109 0:(32,32,32) 0.0 sec 1: 18008 8.1% 1242.2/s -31-Oct 15:04:38 test138 0:(32,32,32) 0.1 sec 27: 17981 8.2% 188.7/s -31-Oct 15:04:38 test138 0:(32,32,32) 0.0 sec 1: 17980 8.2% 32.6/s -31-Oct 15:04:38 test172 0:(32,32,32) 0.1 sec 41: 17938 8.4% 784.9/s -31-Oct 15:04:38 test172 0:(32,32,32) 0.0 sec 4: 17934 8.4% 125.0/s -31-Oct 15:04:38 test155 0:(32,32,32) 0.0 sec 102: 17832 9.0% 2444.9/s -31-Oct 15:04:38 test155 0:(32,32,32) 0.0 sec 5: 17827 9.0% 220.2/s -31-Oct 15:04:38 test155 2:(32,64,32) 0.0 sec 1: 17826 9.0% 54.6/s -31-Oct 15:04:38 test155 2:(32,64,32) 0.0 sec : 17826 9.0% -31-Oct 15:04:38 test155 4:(64,32,32) 0.0 sec 1: 17825 9.0% 53.5/s -31-Oct 15:04:38 test155 4:(64,32,32) 0.0 sec : 17825 9.0% -31-Oct 15:04:38 test174 0:(32,32,32) 0.2 sec 19: 17806 9.1% 99.3/s -31-Oct 15:04:38 test174 0:(32,32,32) 0.0 sec 6: 17800 9.1% 1340.5/s -31-Oct 15:04:38 test203 0:(32,32,32) 0.0 sec 14: 17786 9.2% 771.6/s -31-Oct 15:04:38 test213 0:(32,32,32) 0.1 sec 10: 17776 9.2% 95.9/s -31-Oct 15:04:38 test213 0:(32,32,32) 0.0 sec 1: 17775 9.2% 787.4/s -31-Oct 15:04:38 test216 0:(32,32,32) 0.0 sec 11: 17764 9.3% 446.0/s -31-Oct 15:04:39 test225 0:(32,32,32) 0.3 sec 23: 17741 9.4% 88.6/s -31-Oct 15:04:39 test225 0:(32,32,32) 0.1 sec 29: 17712 9.6% 505.8/s -31-Oct 15:04:39 test226 0:(32,32,32) 0.2 sec 11: 17701 9.6% 65.3/s -31-Oct 15:04:39 test226 0:(32,32,32) 0.0 sec 2: 17699 9.6% 222.0/s -31-Oct 15:04:39 test235 0:(32,32,32) 0.0 sec 19: 17680 9.7% 384.1/s -31-Oct 15:04:39 test235 0:(32,32,32) 0.0 sec 3: 17677 9.7% 253.9/s -31-Oct 15:04:39 test252 0:(32,32,32) 0.0 sec 15: 17662 9.8% 887.2/s -31-Oct 15:04:39 test253 0:(32,32,32) 0.0 sec 32: 17630 
10.0% 695.5/s -31-Oct 15:04:39 test255 0:(32,32,32) 0.0 sec 8: 17622 10.0% 475.9/s -31-Oct 15:04:40 test257 0:(32,32,32) 0.5 sec 51: 17571 10.3% 112.0/s -31-Oct 15:04:40 test257 0:(32,32,32) 0.0 sec 2: 17569 10.3% 3149.6/s -31-Oct 15:04:40 test260 0:(32,32,32) 0.0 sec 3: 17566 10.3% 178.4/s -31-Oct 15:04:40 test261 0:(32,32,32) 0.0 sec 37: 17529 10.5% 2391.9/s -31-Oct 15:04:40 test262 0:(32,32,32) 0.0 sec 15: 17514 10.6% 977.8/s -31-Oct 15:04:40 test263 0:(32,32,32) 0.2 sec 12: 17502 10.6% 49.1/s -31-Oct 15:04:40 test263 0:(32,32,32) 0.0 sec 1: 17501 10.6% 621.1/s -31-Oct 15:04:40 test264 0:(32,32,32) 0.0 sec 146: 17355 11.4% 8079.2/s -31-Oct 15:04:40 test265 0:(32,32,32) 0.4 sec 11: 17344 11.4% 24.5/s -31-Oct 15:04:40 test265 0:(32,32,32) 0.0 sec 1: 17343 11.4% 591.7/s -31-Oct 15:04:41 test267 0:(32,32,32) 0.2 sec 16: 17327 11.5% 86.7/s -31-Oct 15:04:41 test269 0:(32,32,32) 0.0 sec 121: 17206 12.1% 6311.6/s -31-Oct 15:04:41 test271 0:(32,32,32) 0.0 sec 543: 16663 14.9% 16011.1/s -31-Oct 15:04:41 test272 0:(32,32,32) 0.0 sec 8: 16655 15.0% 475.1/s -31-Oct 15:04:41 test273 0:(32,32,32) 0.1 sec 122: 16533 15.6% 2397.6/s -31-Oct 15:04:41 test274 0:(32,32,32) 0.0 sec 119: 16414 16.2% 5565.9/s -31-Oct 15:04:41 test276 0:(32,32,32) 0.2 sec 1697: 14717 24.9% 7619.0/s -31-Oct 15:04:41 test277 0:(32,32,32) 0.0 sec 26: 14691 25.0% 1548.0/s -31-Oct 15:04:41 test279 0:(32,32,32) 0.0 sec 69: 14622 25.3% 3894.3/s -31-Oct 15:04:41 test281 0:(32,32,32) 0.0 sec 14: 14608 25.4% 861.6/s -31-Oct 15:04:41 test268 0:(32,32,32) 0.2 sec 4: 14604 25.4% 20.8/s -31-Oct 15:04:41 test268 0:(32,32,32) 0.0 sec 4: 14600 25.5% 788.0/s -31-Oct 15:04:41 test207 0:(32,32,32) 0.0 sec 7: 14593 25.5% 3558.7/s -31-Oct 15:04:41 test207 1:(32,32,64) 0.0 sec 1: 14592 25.5% 1371.7/s -31-Oct 15:04:42 test211 0:(32,32,32) 0.4 sec 14: 14578 25.6% 34.1/s -31-Oct 15:04:42 test211 0:(32,32,32) 0.0 sec 6: 14572 25.6% 970.2/s -31-Oct 15:04:42 test183 0:(32,32,32) 0.2 sec 16: 14556 25.7% 105.0/s -31-Oct 15:04:42 
test212 0:(32,32,32) 0.2 sec 9: 14547 25.7% 37.6/s -31-Oct 15:04:42 test212 0:(32,32,32) 0.0 sec 5: 14542 25.7% 1814.2/s -31-Oct 15:04:42 test219 0:(32,32,32) 0.0 sec 8: 14534 25.8% 421.2/s -31-Oct 15:04:42 test219 0:(32,32,32) 0.0 sec 4: 14530 25.8% 4683.8/s -[malloc debugging turned on] -31-Oct 15:04:42 test296 0:(32,32,32) 0.0 sec 23: 14507 25.9% 8058.9/s -31-Oct 15:04:42 test289 0:(32,32,32) 0.2 sec 78: 14429 26.3% 364.3/s -31-Oct 15:04:42 test288 0:(32,32,32) 0.0 sec 20: 14409 26.4% 1083.2/s -31-Oct 15:04:42 test244 0:(32,32,32) 0.1 sec 23: 14386 26.5% 361.0/s -31-Oct 15:04:43 test244 1:(32,32,64) 0.0 sec 2: 14384 26.6% 61.7/s -31-Oct 15:04:43 test194 0:(32,32,32) 0.1 sec 24: 14360 26.7% 262.6/s -31-Oct 15:04:43 test09 0:(32,32,32) 0.2 sec 9: 14351 26.7% 41.1/s -31-Oct 15:04:43 test09 0:(32,32,32) 0.0 sec 4: 14347 26.7% 2293.6/s -31-Oct 15:04:43 test108 0:(32,32,32) 0.2 sec 51: 14296 27.0% 303.4/s -31-Oct 15:04:43 test108 0:(32,32,32) 0.0 sec 25: 14271 27.1% 665.9/s -31-Oct 15:04:43 test137 0:(32,32,32) 0.1 sec 13: 14258 27.2% 120.8/s -31-Oct 15:04:43 test137 0:(32,32,32) 0.0 sec 1: 14257 27.2% 383.3/s -31-Oct 15:04:43 test137 0:(32,32,32) 0.0 sec 1: 14256 27.2% 267.7/s -31-Oct 15:04:43 test124 0:(32,32,32) 0.2 sec 6: 14250 27.2% 39.9/s -31-Oct 15:04:48 test133 0:(32,32,32) 4.4 sec 12: 14238 27.3% 2.7/s -31-Oct 15:04:52 test133 0:(32,32,32) 4.7 sec 21: 14217 27.4% 4.5/s -31-Oct 15:04:54 test176 0:(32,32,32) 1.6 sec 26: 14191 27.5% 16.7/s -31-Oct 15:04:54 test176 0:(32,32,32) 0.1 sec 10: 14181 27.6% 150.7/s -31-Oct 15:04:54 test197 0:(32,32,32) 0.2 sec 12: 14169 27.7% 65.7/s -31-Oct 15:04:54 test197 0:(32,32,32) 0.2 sec 3: 14166 27.7% 16.3/s -31-Oct 15:04:55 test201 0:(32,32,32) 0.0 sec 23: 14143 27.8% 638.9/s -31-Oct 15:04:55 test208 0:(32,32,32) 0.0 sec 17: 14126 27.9% 344.7/s -31-Oct 15:04:55 test214 0:(32,32,32) 0.0 sec 39: 14087 28.1% 10623.8/s -31-Oct 15:04:55 test214 0:(32,32,32) 0.0 sec 5: 14082 28.1% 1756.9/s -31-Oct 15:04:55 test214 1:(32,32,64) 0.0 
sec 1: 14081 28.1% 348.6/s -31-Oct 15:04:55 test214 1:(32,32,64) 0.0 sec : 14081 28.1% -31-Oct 15:04:55 test223 0:(32,32,32) 0.0 sec 11: 14070 28.2% 1276.3/s -31-Oct 15:04:55 test223 0:(32,32,32) 0.0 sec 1: 14069 28.2% 65.8/s -31-Oct 15:04:55 test241 0:(32,32,32) 0.1 sec 54: 14015 28.4% 948.2/s -31-Oct 15:04:55 test241 0:(32,32,32) 0.0 sec 8: 14007 28.5% 210.2/s -31-Oct 15:04:55 test270 0:(32,32,32) 0.0 sec 288: 13719 30.0% 11877.3/s -31-Oct 15:04:55 test199 0:(32,32,32) 0.0 sec 16: 13703 30.0% 6204.0/s -31-Oct 15:04:55 test210 0:(32,32,32) 0.1 sec : 13703 30.0% -31-Oct 15:04:55 test210 0:(32,32,32) 0.0 sec 3: 13700 30.0% 3517.0/s -31-Oct 15:04:55 test165 0:(32,32,32) 0.0 sec 13: 13687 30.1% 5917.2/s -31-Oct 15:04:55 test221 0:(32,32,32) 0.1 sec 6: 13681 30.1% 75.0/s -31-Oct 15:04:55 test221 0:(32,32,32) 0.0 sec 1: 13680 30.2% 834.0/s -31-Oct 15:04:55 test278 0:(32,32,32) 0.0 sec 53: 13627 30.4% 2798.9/s -31-Oct 15:04:55 test162 0:(32,32,32) 0.0 sec 20: 13607 30.5% 1012.2/s -31-Oct 15:04:55 test162 0:(32,32,32) 0.0 sec 1: 13606 30.5% 37.6/s -31-Oct 15:04:55 test275 0:(32,32,32) 0.0 sec 27: 13579 30.7% 1119.4/s -31-Oct 15:04:55 test220 0:(32,32,32) 0.2 sec 9: 13570 30.7% 56.9/s -31-Oct 15:04:56 test83 0:(32,32,32) 0.3 sec 7: 13563 30.7% 27.2/s -31-Oct 15:04:56 test83 0:(32,32,32) 0.0 sec 1: 13562 30.8% 1019.4/s -31-Oct 15:04:56 test04 0:(32,32,32) 0.6 sec 12: 13550 30.8% 20.2/s -31-Oct 15:04:56 test04 0:(32,32,32) 0.0 sec 8: 13542 30.9% 1173.0/s -31-Oct 15:04:56 test132 0:(32,32,32) 0.0 sec 9: 13533 30.9% 339.9/s -31-Oct 15:04:57 test82 0:(32,32,32) 0.3 sec 13: 13520 31.0% 37.2/s -31-Oct 15:04:57 test202 0:(32,32,32) 0.0 sec 71: 13449 31.3% 3938.8/s -31-Oct 15:04:57 test202 0:(32,32,32) 0.0 sec 2: 13447 31.3% 172.2/s -31-Oct 15:04:57 test202 0:(32,32,32) 0.0 sec 4: 13443 31.4% 361.8/s -31-Oct 15:04:57 test202 1:(32,32,64) 0.0 sec : 13443 31.4% -31-Oct 15:04:57 test202 1:(32,32,64) 0.0 sec 1: 13442 31.4% 85.5/s -31-Oct 15:04:57 test202 1:(32,32,64) 0.0 sec : 13442 
31.4% -31-Oct 15:04:57 test202 2:(32,64,32) 0.0 sec 1: 13441 31.4% 102.7/s -31-Oct 15:04:57 test202 2:(32,64,32) 0.0 sec : 13441 31.4% -31-Oct 15:04:57 test202 2:(32,64,32) 0.0 sec : 13441 31.4% -31-Oct 15:04:57 test222 0:(32,32,32) 0.0 sec 40: 13401 31.6% 1057.8/s -31-Oct 15:04:57 test204 0:(32,32,32) 0.0 sec 9: 13392 31.6% 553.7/s -31-Oct 15:04:57 test258 0:(32,32,32) 0.2 sec 13: 13379 31.7% 65.0/s -31-Oct 15:04:57 test258 0:(32,32,32) 0.0 sec 2: 13377 31.7% 1310.6/s -31-Oct 15:04:57 test258 1:(32,32,64) 0.0 sec 1: 13376 31.7% 20.2/s -31-Oct 15:04:57 test258 1:(32,32,64) 0.0 sec : 13376 31.7% -31-Oct 15:04:58 test136 0:(32,32,32) 1.2 sec 7: 13369 31.7% 5.6/s -31-Oct 15:04:58 test136 0:(32,32,32) 0.0 sec 53: 13316 32.0% 3753.8/s -31-Oct 15:05:00 test128 0:(32,32,32) 1.5 sec 52: 13264 32.3% 34.0/s -31-Oct 15:05:00 test128 0:(32,32,32) 0.6 sec 1: 13263 32.3% 1.7/s -31-Oct 15:05:01 test144 0:(32,32,32) 0.4 sec 5: 13258 32.3% 11.2/s -31-Oct 15:05:07 test81 0:(32,32,32) 6.5 sec 39: 13219 32.5% 6.0/s -[malloc debugging turned off] -31-Oct 15:05:08 testc2(0,0) 0:(32,32,32) 0.5 sec 172: 13047 33.4% 359.0/s -31-Oct 15:05:08 testc2(0,0) 1:(32,32,64) 0.3 sec 1: 13046 33.4% 3.4/s -31-Oct 15:05:08 test239 0:(32,32,32) 0.0 sec 10: 13036 33.4% 1106.2/s -31-Oct 15:05:09 test239 0:(32,32,32) 0.3 sec 1: 13035 33.4% 3.0/s -31-Oct 15:05:09 test245 0:(32,32,32) 0.3 sec 32: 13003 33.6% 115.7/s -31-Oct 15:05:09 test245 0:(32,32,32) 0.0 sec 8: 12995 33.6% 1483.7/s -31-Oct 15:05:09 test159 0:(32,32,32) 0.3 sec 39: 12956 33.8% 117.4/s -31-Oct 15:05:10 test259 0:(32,32,32) 0.8 sec 27: 12929 34.0% 33.2/s -31-Oct 15:05:10 test259 0:(32,32,32) 0.0 sec 1: 12928 34.0% 210.0/s -31-Oct 15:05:11 testc4(0) 0:(32,32,32) 1.0 sec 11: 12917 34.0% 11.1/s -31-Oct 15:05:12 test157 0:(32,32,32) 0.6 sec 42: 12875 34.3% 73.5/s -31-Oct 15:05:13 test182 0:(32,32,32) 1.8 sec 22: 12853 34.4% 11.9/s -31-Oct 15:05:14 test182 0:(32,32,32) 0.3 sec 13: 12840 34.4% 52.0/s -31-Oct 15:05:16 test195 0:(32,32,32) 2.1 sec 
62: 12778 34.8% 30.0/s -31-Oct 15:05:20 test135 0:(32,32,32) 3.8 sec 43: 12735 35.0% 11.2/s -31-Oct 15:05:20 test215 0:(32,32,32) 0.1 sec 2: 12733 35.0% 23.0/s -31-Oct 15:05:22 test80 0:(32,32,32) 2.6 sec 11: 12722 35.0% 4.2/s -31-Oct 15:05:23 test200 0:(32,32,32) 0.9 sec 11: 12711 35.1% 11.9/s -31-Oct 15:05:26 test283 0:(32,32,32) 2.7 sec 93: 12618 35.6% 34.5/s -31-Oct 15:05:28 test283 1:(32,32,64) 1.8 sec 1: 12617 35.6% 0.6/s -31-Oct 15:05:28 test254 0:(32,32,32) 0.2 sec 25: 12592 35.7% 115.9/s -31-Oct 15:05:29 test254 0:(32,32,32) 1.1 sec 1: 12591 35.7% 0.9/s -31-Oct 15:05:32 test54 0:(32,32,32) 3.1 sec 26: 12564 35.8% 8.5/s -31-Oct 15:05:33 test54 0:(32,32,32) 1.3 sec 13: 12551 35.9% 10.3/s -31-Oct 15:05:36 testcc(1) 0:(32,32,32) 2.2 sec 10: 12541 36.0% 4.6/s -31-Oct 15:05:37 testcc(1) 0:(32,32,32) 1.4 sec 6: 12535 36.0% 4.3/s -31-Oct 15:05:37 testc2(1,1) 0:(32,32,32) 0.3 sec 11: 12524 36.1% 36.8/s -31-Oct 15:05:40 testc2(1,1) 0:(32,32,32) 2.7 sec 3: 12521 36.1% 1.1/s -31-Oct 15:05:41 test141 0:(32,32,32) 1.3 sec 521: 12000 38.7% 392.9/s -31-Oct 15:05:41 test179 0:(32,32,32) 0.0 sec 22: 11978 38.8% 714.2/s -31-Oct 15:05:42 test179 0:(32,32,32) 0.7 sec 10: 11968 38.9% 14.4/s -31-Oct 15:05:42 test188b 0:(32,32,32) 0.0 sec 39: 11929 39.1% 1397.2/s -31-Oct 15:05:42 test185 0:(32,32,32) 0.0 sec 23: 11906 39.2% 583.2/s -31-Oct 15:05:42 test256 0:(32,32,32) 0.2 sec 38: 11868 39.4% 214.4/s -31-Oct 15:05:42 test256 0:(32,32,32) 0.0 sec : 11868 39.4% -31-Oct 15:05:42 test256 1:(32,32,64) 0.2 sec 1: 11867 39.4% 5.6/s -31-Oct 15:05:42 test256 1:(32,32,64) 0.0 sec : 11867 39.4% -31-Oct 15:05:43 test238b 0:(32,32,32) 0.7 sec 31: 11836 39.6% 43.8/s -31-Oct 15:05:44 test238 0:(32,32,32) 0.5 sec 64: 11772 39.9% 134.1/s -31-Oct 15:05:45 test186 0:(32,32,32) 0.9 sec 25: 11747 40.0% 28.0/s -31-Oct 15:05:45 test186 0:(32,32,32) 0.1 sec : 11747 40.0% -31-Oct 15:05:45 test186 0:(32,32,32) 0.1 sec : 11747 40.0% -[malloc debugging turned on] -31-Oct 15:05:47 testca(1) 0:(32,32,32) 1.7 
sec 38: 11709 40.2% 21.9/s -31-Oct 15:05:48 testca(1) 0:(32,32,32) 1.7 sec 2: 11707 40.2% 1.2/s -31-Oct 15:05:49 test148 0:(32,32,32) 0.4 sec 7: 11700 40.3% 16.1/s -31-Oct 15:05:49 test148 0:(32,32,32) 0.0 sec 4: 11696 40.3% 1480.9/s -31-Oct 15:05:50 test231 0:(32,32,32) 1.3 sec 385: 11311 42.2% 305.5/s -31-Oct 15:05:50 test129 0:(32,32,32) 0.6 sec 10: 11301 42.3% 17.8/s -31-Oct 15:05:55 test69 0:(32,32,32) 4.9 sec 31: 11270 42.5% 6.3/s -31-Oct 15:05:59 test69 0:(32,32,32) 3.7 sec 12: 11258 42.5% 3.3/s -31-Oct 15:06:01 test29 0:(32,32,32) 1.9 sec 151: 11107 43.3% 80.2/s -31-Oct 15:06:03 test29 0:(32,32,32) 1.8 sec 2: 11105 43.3% 1.1/s -31-Oct 15:06:05 test29 1:(32,32,64) 1.8 sec 2: 11103 43.3% 1.1/s -31-Oct 15:06:06 test29 1:(32,32,64) 1.8 sec : 11103 43.3% -31-Oct 15:06:07 test282 0:(32,32,32) 0.3 sec 15: 11088 43.4% 46.4/s -31-Oct 15:06:07 test249 0:(32,32,32) 0.3 sec 19: 11069 43.5% 56.3/s -31-Oct 15:06:07 test249 0:(32,32,32) 0.3 sec 1: 11068 43.5% 3.1/s -31-Oct 15:06:08 test196 0:(32,32,32) 0.5 sec 18: 11050 43.6% 33.8/s -31-Oct 15:06:08 test250 0:(32,32,32) 0.5 sec 69: 10981 43.9% 135.5/s -31-Oct 15:06:09 test250 0:(32,32,32) 0.7 sec 4: 10977 44.0% 5.8/s -31-Oct 15:06:10 test145 0:(32,32,32) 0.7 sec 22: 10955 44.1% 32.2/s -31-Oct 15:06:10 test145 0:(32,32,32) 0.0 sec 6: 10949 44.1% 720.5/s -31-Oct 15:06:16 test229 0:(32,32,32) 5.9 sec 14: 10935 44.2% 2.4/s -31-Oct 15:06:18 test209 0:(32,32,32) 2.0 sec 44: 10891 44.4% 22.1/s -31-Oct 15:06:20 test209 1:(32,32,64) 2.1 sec 1: 10890 44.4% 0.5/s -31-Oct 15:06:21 test224 0:(32,32,32) 1.5 sec 60: 10830 44.7% 40.3/s -31-Oct 15:06:22 test191 0:(32,32,32) 0.4 sec 26: 10804 44.8% 62.6/s -31-Oct 15:06:22 test191 0:(32,32,32) 0.1 sec 2: 10802 44.8% 20.9/s -31-Oct 15:06:22 test150 0:(32,32,32) 0.0 sec 20: 10782 44.9% 585.1/s -31-Oct 15:06:22 test240 0:(32,32,32) 0.3 sec 26: 10756 45.1% 88.1/s -31-Oct 15:06:23 test240 0:(32,32,32) 0.3 sec 1: 10755 45.1% 2.9/s -31-Oct 15:06:23 test237 0:(32,32,32) 0.3 sec 10: 10745 45.1% 
31.8/s -31-Oct 15:06:23 test237 0:(32,32,32) 0.0 sec 1: 10744 45.1% 100.6/s -31-Oct 15:06:23 test237 0:(32,32,32) 0.0 sec 1: 10743 45.1% 94.7/s -31-Oct 15:06:23 test237 0:(32,32,32) 0.0 sec 1: 10742 45.2% 115.7/s -31-Oct 15:06:24 test184 0:(32,32,32) 1.1 sec 7: 10735 45.2% 6.5/s -31-Oct 15:06:27 test236 0:(32,32,32) 3.0 sec 117: 10618 45.8% 39.4/s -[malloc debugging turned off] -31-Oct 15:06:38 test84 0:(32,32,32) 10.7 sec 15: 10603 45.9% 1.4/s -31-Oct 15:06:38 test84 0:(32,32,32) 0.4 sec 32: 10571 46.0% 75.9/s -31-Oct 15:06:47 test84 2:(32,64,32) 9.1 sec 1: 10570 46.0% 0.1/s -31-Oct 15:06:48 test84 2:(32,64,32) 0.4 sec : 10570 46.0% -31-Oct 15:06:48 test84 0:(32,32,32) 0.5 sec : 10570 46.0% -31-Oct 15:06:49 test84 0:(32,32,32) 0.5 sec 4: 10566 46.1% 8.4/s -31-Oct 15:06:49 test84 2:(32,64,32) 0.5 sec 1: 10565 46.1% 2.2/s -31-Oct 15:06:50 test84 2:(32,64,32) 0.5 sec : 10565 46.1% -31-Oct 15:07:07 test173 0:(32,32,32) 17.6 sec 20: 10545 46.2% 1.1/s -31-Oct 15:07:08 test173 0:(32,32,32) 0.4 sec 4: 10541 46.2% 10.1/s -31-Oct 15:07:19 test230 0:(32,32,32) 11.3 sec 250: 10291 47.5% 22.0/s -31-Oct 15:07:21 test230 0:(32,32,32) 1.7 sec 2: 10289 47.5% 1.2/s -31-Oct 15:07:32 test18 0:(32,32,32) 11.9 sec 91: 10198 47.9% 7.7/s -31-Oct 15:07:35 test18 0:(32,32,32) 2.8 sec 7: 10191 48.0% 2.5/s -31-Oct 15:08:16 testc7(0) 0:(32,32,32) 41.0 sec 12: 10179 48.0% 0.3/s -31-Oct 15:08:22 testc7(0) 0:(32,32,32) 5.9 sec 11: 10168 48.1% 1.9/s -31-Oct 15:08:47 test193 0:(32,32,32) 24.8 sec 200: 9968 49.1% 8.1/s -31-Oct 15:08:50 test127 0:(32,32,32) 2.7 sec 929: 9039 53.8% 342.6/s -31-Oct 15:08:54 test23 0:(32,32,32) 4.3 sec 61: 8978 54.2% 14.2/s -31-Oct 15:09:00 test243 0:(32,32,32) 6.3 sec 7: 8971 54.2% 1.1/s -31-Oct 15:10:01 test53 0:(32,32,32) 60.2 sec 38: 8933 54.4% 0.6/s -31-Oct 15:10:05 test53 0:(32,32,32) 4.0 sec 5: 8928 54.4% 1.2/s -31-Oct 15:10:17 test242 0:(32,32,32) 12.1 sec 45: 8883 54.6% 3.7/s -31-Oct 15:10:28 test17 0:(32,32,32) 10.7 sec 32: 8851 54.8% 3.0/s -31-Oct 15:10:37 
test246 0:(32,32,32) 9.7 sec 5: 8846 54.8% 0.5/s -31-Oct 15:10:42 test251b 0:(32,32,32) 5.0 sec 26: 8820 55.0% 5.2/s -31-Oct 15:10:53 test251 0:(32,32,32) 10.7 sec 100: 8720 55.5% 9.4/s -31-Oct 15:11:05 test152 0:(32,32,32) 11.9 sec 190: 8530 56.4% 15.9/s -31-Oct 15:11:18 test152 0:(32,32,32) 13.6 sec 123: 8407 57.1% 9.1/s -31-Oct 15:11:21 test160 0:(32,32,32) 2.7 sec 17: 8390 57.2% 6.3/s -31-Oct 15:11:57 test232 0:(32,32,32) 35.4 sec 58: 8332 57.5% 1.6/s -31-Oct 15:11:57 test232 0:(32,32,32) 0.4 sec 5: 8327 57.5% 11.1/s -31-Oct 15:11:59 test142b 0:(32,32,32) 1.6 sec 10: 8317 57.5% 6.2/s -31-Oct 15:11:59 test142b 0:(32,32,32) 0.0 sec 3: 8314 57.5% 206.9/s -31-Oct 15:12:52 test142 0:(32,32,32) 52.7 sec 304: 8010 59.1% 5.8/s -31-Oct 15:14:00 test227 0:(32,32,32) 68.7 sec 15: 7995 59.2% 0.2/s -31-Oct 15:14:09 test292 0:(32,32,32) 8.5 sec 1: 7994 59.2% 0.1/s -31-Oct 15:14:10 test192 0:(32,32,32) 1.2 sec 3: 7991 59.2% 2.6/s -31-Oct 15:14:19 test181 0:(32,32,32) 8.5 sec 4: 7987 59.2% 0.5/s -31-Oct 15:14:23 test181 0:(32,32,32) 4.7 sec 11: 7976 59.3% 2.4/s -[malloc debugging turned on] -31-Oct 15:14:40 test130 0:(32,32,32) 16.9 sec 8: 7968 59.3% 0.5/s -31-Oct 15:14:41 test130 0:(32,32,32) 0.4 sec 5: 7963 59.3% 13.0/s -31-Oct 15:14:41 test206 0:(32,32,32) 0.6 sec 86: 7877 59.8% 132.4/s -31-Oct 15:14:46 test206 0:(32,32,32) 4.9 sec 12: 7865 59.8% 2.5/s -31-Oct 15:15:01 test02 0:(32,32,32) 14.9 sec 3: 7862 59.9% 0.2/s -31-Oct 15:15:52 test11 0:(32,32,32) 50.5 sec 17: 7845 59.9% 0.3/s -31-Oct 15:15:57 test187 0:(32,32,32) 5.3 sec 5: 7840 60.0% 0.9/s -31-Oct 15:15:59 test187 0:(32,32,32) 2.1 sec 1: 7839 60.0% 0.5/s -31-Oct 15:16:02 test169 0:(32,32,32) 2.6 sec 32: 7807 60.1% 12.2/s -31-Oct 15:16:06 test76 0:(32,32,32) 3.9 sec 14: 7793 60.2% 3.6/s -31-Oct 15:16:14 test01 0:(32,32,32) 7.9 sec 592: 7201 63.2% 75.2/s -31-Oct 15:16:19 test01 0:(32,32,32) 4.9 sec 4: 7197 63.3% 0.8/s -31-Oct 15:16:23 test228 0:(32,32,32) 4.0 sec 25: 7172 63.4% 6.2/s -31-Oct 15:16:28 test104 
0:(32,32,32) 5.1 sec 35: 7137 63.6% 6.8/s -31-Oct 15:17:30 test284 0:(32,32,32) 62.0 sec 68: 7069 63.9% 1.1/s -31-Oct 15:17:32 test284 0:(32,32,32) 2.1 sec 4: 7065 63.9% 1.9/s -31-Oct 15:17:53 test180 0:(32,32,32) 20.6 sec 21: 7044 64.0% 1.0/s -31-Oct 15:17:59 test180 0:(32,32,32) 6.2 sec 91: 6953 64.5% 14.7/s -31-Oct 15:18:08 test188 0:(32,32,32) 8.8 sec 169: 6784 65.4% 19.2/s -31-Oct 15:18:37 test151b 0:(32,32,32) 28.9 sec 34: 6750 65.5% 1.2/s -31-Oct 15:18:37 test151b 0:(32,32,32) 0.2 sec 18: 6732 65.6% 83.2/s -31-Oct 15:18:59 test14b 0:(32,32,32) 22.3 sec 95: 6637 66.1% 4.3/s -31-Oct 15:19:26 test14 0:(32,32,32) 26.4 sec 257: 6380 67.4% 9.7/s -[malloc debugging turned off] -31-Oct 15:19:57 test125 0:(32,32,32) 30.7 sec 319: 6061 69.1% 10.4/s -31-Oct 15:21:08 test10 0:(32,32,32) 71.7 sec 702: 5359 72.6% 9.8/s -31-Oct 15:21:46 test75b 0:(32,32,32) 37.0 sec 293: 5066 74.1% 7.9/s -31-Oct 15:22:15 test74 0:(32,32,32) 29.5 sec 4101: 965 95.1% 139.2/s -31-Oct 15:24:46 test234 0:(32,32,32) 150.8 sec 198: 767 96.1% 1.3/s -[malloc debugging turned on] -31-Oct 15:24:46 test154b 0:(32,32,32) 0.0 sec 12: 755 96.1% 563.6/s -31-Oct 15:26:48 test154 0:(32,32,32) 122.1 sec 657: 98 99.5% 5.4/s -31-Oct 15:27:57 test21b 0:(32,32,32) 68.2 sec 64: 34 99.8% 0.9/s -31-Oct 15:32:04 test19b 0:(32,32,32) 246.9 sec 20: 14 99.9% 0.1/s -31-Oct 15:32:06 test19b 0:(32,32,32) 1.6 sec 4: 10 99.9% 2.6/s -[malloc debugging turned off] -31-Oct 15:34:41 test19 0:(32,32,32) 154.9 sec 7: 3 99.9% 0.0/s -31-Oct 15:34:42 test19 0:(32,32,32) 0.9 sec 2: 1 99.9% 2.1/s -31-Oct 15:34:42 test280(0) 0:(32,32,32) 0.0 sec 1: all 100% 26.1/s -[malloc debugging turned off] -31-Oct-2025 15:34:42 grbcov ending diff --git a/GraphBLAS/Test/GB_mex_edit.c b/GraphBLAS/Test/GB_mex_edit.c index 2d559c41f4..eab0b68476 100644 --- a/GraphBLAS/Test/GB_mex_edit.c +++ b/GraphBLAS/Test/GB_mex_edit.c @@ -8,15 +8,18 @@ //------------------------------------------------------------------------------ #include "GB_mex.h" +#include 
"GB_mex_errors.h" #define USAGE "C = GB_mex_edit (C, I, J, X, Action)" #define FREE_ALL \ { \ GrB_Matrix_free_(&C) ; \ + GrB_Matrix_free_(&B) ; \ GB_mx_put_global (true) ; \ } +#undef OK #define OK(method) \ { \ info = method ; \ @@ -27,6 +30,9 @@ } \ } +#define GET_DEEP_COPY +#define FREE_DEEP_COPY ; + void mexFunction ( int nargout, @@ -36,7 +42,7 @@ void mexFunction ) { - GrB_Matrix C = NULL ; + GrB_Matrix C = NULL, B = NULL ; uint64_t *I = NULL, ni = 0, I_range [3] ; // OK uint64_t *J = NULL, nj = 0, J_range [3] ; // OK bool ignore ; @@ -170,6 +176,16 @@ void mexFunction OK (GrB_Scalar_free (&Scalar)) ; GB_Global_malloc_debug_set (save) ; + //-------------------------------------------------------------------------- + // duplicate the matrix and check if it stays the same + //-------------------------------------------------------------------------- + + METHOD (GrB_Matrix_dup (&B, C)) ; + CHECK (GB_mx_isequal (B, C, 0)) ; + OK (GrB_Matrix_wait (C, GrB_MATERIALIZE)) ; + OK (GrB_Matrix_wait (B, GrB_MATERIALIZE)) ; + CHECK (GB_mx_isequal (B, C, 0)) ; + //-------------------------------------------------------------------------- // return C as a built-in sparse matrix //-------------------------------------------------------------------------- diff --git a/GraphBLAS/Test/GB_mex_msort_1.c b/GraphBLAS/Test/GB_mex_msort_1.c index 9e9bd05fe1..54f4874d01 100644 --- a/GraphBLAS/Test/GB_mex_msort_1.c +++ b/GraphBLAS/Test/GB_mex_msort_1.c @@ -11,6 +11,8 @@ #define USAGE "[I] = GB_mex_msort_1 (I,nthreads)" +#define WALLCLOCK GB_omp_get_wtime ( ) + void mexFunction ( int nargout, @@ -52,7 +54,10 @@ void mexFunction void *Iout = mxGetData (pargout [0]) ; memcpy (Iout, I, n * (I_is_32 ? 
sizeof (uint32_t) : sizeof (uint64_t))) ; + double t = WALLCLOCK ; GB_msort_1 (Iout, I_is_32, n, nthreads) ; + t = WALLCLOCK - t ; + printf ("nthreads %d, n: %ld, time: %g\n", nthreads, n, t) ; GB_mx_put_global (true) ; } diff --git a/GraphBLAS/Test/GB_mex_test2.c b/GraphBLAS/Test/GB_mex_test2.c index b56b5e12cb..df1690a7d9 100644 --- a/GraphBLAS/Test/GB_mex_test2.c +++ b/GraphBLAS/Test/GB_mex_test2.c @@ -267,6 +267,7 @@ void mexFunction OK (GrB_Matrix_new (&A, GrB_INT32, n, n)) ; OK (GrB_Matrix_setElement_INT32 (A, 12345, 0, 0)) ; + OK (GrB_Matrix_wait (A, GrB_MATERIALIZE)) ; OK (GrB_Matrix_dup (&C, A)) ; CHECK (!GB_any_aliased (A, C)) ; GB_free_memory (&(C->p), C->p_size) ; diff --git a/GraphBLAS/Test/test303.m b/GraphBLAS/Test/test303.m new file mode 100644 index 0000000000..667383710a --- /dev/null +++ b/GraphBLAS/Test/test303.m @@ -0,0 +1,28 @@ +function test303 +%TEST303 test C=A(I,J), method 6 + +% SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2025, All Rights Reserved. +% SPDX-License-Identifier: Apache-2.0 + +fprintf ('test303 ------------------ C=A(I,J)\n') ; + +% construct the problem +n = 2000 ; +rng ('default') ; +A = sprand (n, n, 0.5) ; +nI = 4 ; +I = randi (2000, nI, 1) ; +I0 = uint64 (I-1) ; +I0 + +% test method 6 in GrB_extract +% C1 = A (I,:) ; +C1 = GB_mex_Matrix_subref (A, I0, [ ]) ; + +% compare with MATLAB +B = double (A) ; +C2 = B (I,:) ; +assert (isequal (C1, C2)) ; + +fprintf ('\ntest303: tests passed\n') ; + diff --git a/GraphBLAS/Test/testall.m b/GraphBLAS/Test/testall.m index 19210ee4e4..05025e2795 100644 --- a/GraphBLAS/Test/testall.m +++ b/GraphBLAS/Test/testall.m @@ -120,9 +120,11 @@ function testall (threads, mdebug) % < 1 second: debug_on set_malloc_debug (mdebug, 1) ; +logstat ('test303' ,t, J404 , F110 ) ; % C=A(I,J), method 6 logstat ('test300' ,t, J0 , F0 ) ; % print function for a type logstat ('test301' ,t, J40 , F11 ) ; % assign method27, C+=A logstat ('test302' ,t, J0 , F0 ) ; % GPU controls +logstat ('test155' ,t, J40 
, F10 , [0 2 4]) ; % setElement, removeElement % < 1 second: debug_off set_malloc_debug (mdebug, 0) ; @@ -142,7 +144,6 @@ function testall (threads, mdebug) logstat ('test109' ,t, J4040, F1100) ; % terminal monoid with user-defn type logstat ('test138' ,s, J40 , F10 ) ; % assign, coarse-only in IxJ slice logstat ('test172' ,t, J40 , F10 ) ; % eWiseMult with M bitmap/full -logstat ('test155' ,t, J40 , F10 , [0 2 4]) ; % setElement, removeElement logstat ('test174' ,t, J40 , F10 ) ; % GrB_assign C=A logstat ('test203' ,t, J4 , F1 ) ; % iso subref logstat ('test213' ,t, J40 , F10 ) ; % iso assign (method 05d) diff --git a/GraphBLAS/cmake_modules/GraphBLASReport.cmake b/GraphBLAS/cmake_modules/GraphBLASReport.cmake index 70ec9765cd..f5852bd13c 100644 --- a/GraphBLAS/cmake_modules/GraphBLASReport.cmake +++ b/GraphBLAS/cmake_modules/GraphBLASReport.cmake @@ -28,4 +28,13 @@ else ( ) endif ( ) get_property ( CDEFN DIRECTORY PROPERTY COMPILE_DEFINITIONS ) message ( STATUS "compile definitions: ${CDEFN}") +message ( STATUS "GraphBLAS has CUDA: ${GRAPHBLAS_HAS_CUDA}") + +if ( GRAPHBLAS_HAS_CUDA ) + message ( STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER} ") + message ( STATUS "CUDA flags: ${CMAKE_CUDA_FLAGS} ") + message ( STATUS "CUDA release: ${CMAKE_CUDA_FLAGS_RELEASE} ") + message ( STATUS "CUDA flags debug: ${CMAKE_CUDA_FLAGS_DEBUG} ") +endif ( ) + message ( STATUS "------------------------------------------------------------------------" ) diff --git a/GraphBLAS/cmake_modules/GraphBLAS_JIT_configure.cmake b/GraphBLAS/cmake_modules/GraphBLAS_JIT_configure.cmake index ba988c4686..9c3665e79a 100644 --- a/GraphBLAS/cmake_modules/GraphBLAS_JIT_configure.cmake +++ b/GraphBLAS/cmake_modules/GraphBLAS_JIT_configure.cmake @@ -73,11 +73,18 @@ if ( GRAPHBLAS_JIT_ENABLE_RELOCATE ) # convert to -l flags to avoid relocation issues, i.e.: "-lgomp -lpthread -lm" set ( GB_C_LIBRARIES "" ) foreach ( _lib ${GB_CMAKE_LIBRARIES} ) + + # skip CUDA::cuda_driver, etc + if ( ${_lib} MATCHES 
"CUDA::" ) + continue ( ) + endif ( ) + string ( FIND ${_lib} "." _pos REVERSE ) if ( ${_pos} EQUAL "-1" ) set ( GB_C_LIBRARIES "${GB_C_LIBRARIES} -l${_lib}" ) continue () endif ( ) + set ( _kinds "SHARED" "STATIC" ) if ( WIN32 ) list ( PREPEND _kinds "IMPORT" ) @@ -101,16 +108,20 @@ else ( ) string ( REPLACE "." "\\." LIBSUFFIX2 ${CMAKE_STATIC_LIBRARY_SUFFIX} ) set ( GB_C_LIBRARIES "" ) foreach ( LIB_NAME ${GB_CMAKE_LIBRARIES} ) - if (( LIB_NAME MATCHES ${LIBSUFFIX1} ) OR ( LIB_NAME MATCHES ${LIBSUFFIX2} )) + message ( STATUS "lib: ${LIB_NAME} " ) + if ( LIB_NAME MATCHES "CUDA::" ) + continue ( ) + elseif (( LIB_NAME MATCHES ${LIBSUFFIX1} ) OR ( LIB_NAME MATCHES ${LIBSUFFIX2} )) string ( APPEND GB_C_LIBRARIES " " ${LIB_NAME} ) else ( ) string ( APPEND GB_C_LIBRARIES " -l" ${LIB_NAME} ) endif ( ) + endforeach ( ) endif ( ) -if ( GRAPHBLAS_USE_JIT OR GRAPHBLAS_USE_CUDA ) +if ( GRAPHBLAS_USE_JIT OR GRAPHBLAS_HAS_CUDA ) message ( STATUS "------------------------------------------------------------------------" ) message ( STATUS "JIT configuration:" ) message ( STATUS "------------------------------------------------------------------------" ) diff --git a/GraphBLAS/cmake_modules/GraphBLAS_JIT_paths.cmake b/GraphBLAS/cmake_modules/GraphBLAS_JIT_paths.cmake index f03ddeca34..eeb66063d5 100644 --- a/GraphBLAS/cmake_modules/GraphBLAS_JIT_paths.cmake +++ b/GraphBLAS/cmake_modules/GraphBLAS_JIT_paths.cmake @@ -38,11 +38,6 @@ endif ( ) # GRAPHBLAS_USE_JIT and GRAPHBLAS_COMPACT options #------------------------------------------------------------------------------- -if ( SUITESPARSE_HAS_CUDA AND GRAPHBLAS_USE_CUDA ) - # FOR NOW: do not compile FactoryKernels when developing the CUDA kernels -# set ( GRAPHBLAS_COMPACT on ) # Fixme for CUDA -endif ( ) - include ( CheckSymbolExists ) check_symbol_exists ( system "stdlib.h" HAVE_C_SYSTEM ) diff --git a/GraphBLAS/cmake_modules/GraphBLAS_version.cmake b/GraphBLAS/cmake_modules/GraphBLAS_version.cmake index 3e04bdab67..b2be1d004c 
100644 --- a/GraphBLAS/cmake_modules/GraphBLAS_version.cmake +++ b/GraphBLAS/cmake_modules/GraphBLAS_version.cmake @@ -8,10 +8,10 @@ #------------------------------------------------------------------------------- # version of SuiteSparse:GraphBLAS -set ( GraphBLAS_DATE "Nov 1, 2025" ) +set ( GraphBLAS_DATE "Jan 21, 2026" ) set ( GraphBLAS_VERSION_MAJOR 10 CACHE STRING "" FORCE ) -set ( GraphBLAS_VERSION_MINOR 2 CACHE STRING "" FORCE ) -set ( GraphBLAS_VERSION_SUB 0 CACHE STRING "" FORCE ) +set ( GraphBLAS_VERSION_MINOR 3 CACHE STRING "" FORCE ) +set ( GraphBLAS_VERSION_SUB 1 CACHE STRING "" FORCE ) # GraphBLAS C API Specification version, at graphblas.org set ( GraphBLAS_API_DATE "Dec 22, 2023" ) diff --git a/GraphBLAS/rmm_wrap/CMakeLists.txt b/GraphBLAS/rmm_wrap/CMakeLists.txt index e4d3c109f9..ac75daf20a 100644 --- a/GraphBLAS/rmm_wrap/CMakeLists.txt +++ b/GraphBLAS/rmm_wrap/CMakeLists.txt @@ -17,6 +17,7 @@ set ( CMAKE_C_STANDARD 99 ) set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC" ) find_package ( CUDAToolkit REQUIRED ) +message ( STATUS "CUDA include dirs: ${CUDA_INCLUDE_DIRS} " ) set ( EXTERNAL_INCLUDES_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external_includes ) @@ -25,9 +26,9 @@ if ( NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY} ) endif ( ) if ( NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY}/spdlog ) - message ( status "cloning spdlog v1.10.0" ) + message ( STATUS "cloning spdlog v1.10.0" ) execute_process ( - COMMAND git clone "https://github.com/gabime/spdlog.git" --branch v1.10.0 --recursive spdlog + COMMAND git clone "https://github.com/gabime/spdlog" --branch v1.10.0 --recursive spdlog WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external_includes ) endif ( ) @@ -35,9 +36,9 @@ set ( SPDLOG_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external_includes/spdlog/in include_directories ( ${SPDLOG_INCLUDE_DIR} ) if ( NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY}/rmm ) - message ( status "cloning rmm branch-21.10" ) + message ( STATUS "cloning rmm branch-21.10" ) 
execute_process ( - COMMAND git clone "https://github.com/rapidsai/rmm.git" --branch branch-21.10 --recursive rmm + COMMAND git clone "https://github.com/rapidsai/rmm" --branch branch-21.10 --recursive rmm WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external_includes ) endif() diff --git a/GraphBLAS/rmm_wrap/rmm_wrap.cpp b/GraphBLAS/rmm_wrap/rmm_wrap.cpp index aecfbe15ab..b2bfe90708 100644 --- a/GraphBLAS/rmm_wrap/rmm_wrap.cpp +++ b/GraphBLAS/rmm_wrap/rmm_wrap.cpp @@ -50,8 +50,8 @@ typedef struct std::shared_ptr resource; std::shared_ptr host_resource; std::shared_ptr size_map ; - std::shared_ptr stream_pool; - cudaStream_t main_stream; +// std::shared_ptr stream_pool; +// cudaStream_t main_stream; } RMM_Wrap_Handle ; @@ -144,6 +144,7 @@ inline auto make_and_set_managed_pool return resource; } +#if 0 inline std::shared_ptr make_and_set_cuda_stream_pool ( std::size_t num_streams @@ -151,6 +152,7 @@ inline std::shared_ptr make_and_set_cuda_stream_pool { return std::make_shared(num_streams); } +#endif //------------------------------------------------------------------------------ // rmm_wrap_is_initialized: determine if rmm_wrap_context exists @@ -169,6 +171,8 @@ bool rmm_wrap_is_initialized (void) // the rmm_wrap_context: the memory resource (host or device) and the // alloc_map. +// FIXME: GraphBLAS currently does not call this method ... + void rmm_wrap_finalize (void) { try @@ -177,7 +181,7 @@ void rmm_wrap_finalize (void) { for (int device_id = 0; device_id < devices.size(); ++device_id) { - RMM_WRAP_CHECK_CUDA(cudaStreamDestroy(rmm_wrap_context[device_id]->main_stream)); +// RMM_WRAP_CHECK_CUDA(cudaStreamDestroy(rmm_wrap_context[device_id]->main_stream)); delete rmm_wrap_context[device_id]; } delete rmm_wrap_context ; @@ -212,8 +216,8 @@ int rmm_wrap_initialize // returns -1 on error, 0 on success uint32_t device_id, // GPU device id, for cudaSetDevice RMM_MODE mode, // TODO: describe. Should we default this? size_t init_pool_size, // TODO: describe. 
Should we default this? - size_t max_pool_size, // TODO: describe. Should we default this? - size_t stream_pool_size // TODO: describe. Should we default this? + size_t max_pool_size // TODO: describe. Should we default this? +// , size_t stream_pool_size // TODO: describe. Should we default this? ) { @@ -229,12 +233,14 @@ int rmm_wrap_initialize // returns -1 on error, 0 on success return (-1) ; } +#if 0 if(stream_pool_size <= 0) { // std::cout << "Stream pool size must be >=0" << std::endl; // failed to create the alloc_map return (-1) ; } +#endif RMM_WRAP_CHECK_CUDA (cudaSetDevice (device_id)) ; @@ -249,10 +255,12 @@ int rmm_wrap_initialize // returns -1 on error, 0 on success // Construct a resource that uses a coalescing best-fit pool allocator //---------------------------------------------------------------------- +#if 0 // Set CUDA stream pool // std::cout << "Creating rmm_wrap stream pool" << std::endl; rmm_wrap_context[device_id]->stream_pool = make_and_set_cuda_stream_pool(stream_pool_size); RMM_WRAP_CHECK_CUDA(cudaStreamCreate(&(rmm_wrap_context[device_id]->main_stream))); +#endif if (mode == rmm_wrap_host ) { @@ -318,8 +326,8 @@ int rmm_wrap_initialize_all_same ( RMM_MODE mode, // TODO: describe. Should we default this? size_t init_pool_size, // TODO: describe. Should we default this? - size_t max_pool_size, // TODO: describe. Should we default this? - size_t stream_pool_size // TODO: describe. Should we default this? + size_t max_pool_size // TODO: describe. Should we default this? +// , size_t stream_pool_size // TODO: describe. Should we default this? 
) { try @@ -387,7 +395,7 @@ int rmm_wrap_initialize_all_same rmm_wrap_context[i] = NULL; uint32_t device_id = devices[i]; // std::cout << "Creating rmm_wrap_context for device_id " << device_id << std::endl; - int ret = rmm_wrap_initialize(device_id, mode, init_pool_size, max_pool_size, stream_pool_size); + int ret = rmm_wrap_initialize(device_id, mode, init_pool_size, max_pool_size ) ; // , stream_pool_size); if(ret < 0) { return ret; } @@ -403,6 +411,7 @@ int rmm_wrap_initialize_all_same } } +#if 0 //------------------------------------------------------------------------------ // rmm_wrap_get_next_stream_from_pool: return the next available stream from // the pool Output is cudaStream_t @@ -434,6 +443,8 @@ void* rmm_wrap_get_main_stream(void) // FIXME: check for errors return rmm_wrap_context[get_current_device()]->main_stream; } +#endif + //------------------------------------------------------------------------------ // rmm_wrap_malloc: malloc-equivalent method using RMM //------------------------------------------------------------------------------ diff --git a/GraphBLAS/rmm_wrap/rmm_wrap.h b/GraphBLAS/rmm_wrap/rmm_wrap.h index 3a081d5a2a..7726534a2c 100644 --- a/GraphBLAS/rmm_wrap/rmm_wrap.h +++ b/GraphBLAS/rmm_wrap/rmm_wrap.h @@ -55,8 +55,8 @@ int rmm_wrap_initialize uint32_t device_id, RMM_MODE mode, size_t init_pool_size, - size_t max_pool_size, - size_t stream_pool_size + size_t max_pool_size + // , size_t stream_pool_size ) ; // initialize rmm_wrap_contexts for each device in CUDA_VISIBLE_DEVICES @@ -65,8 +65,8 @@ int rmm_wrap_initialize_all_same ( RMM_MODE mode, size_t init_pool_size, - size_t max_pool_size, - size_t stream_pool_size + size_t max_pool_size + // , size_t stream_pool_size ) ; // destroy an RMM resource @@ -83,9 +83,9 @@ void *rmm_wrap_realloc (void *p, size_t newsize) ; void rmm_wrap_free (void *p) ; // Get streams from context (based on current device_id): -void* rmm_wrap_get_next_stream_from_pool(void); -void* 
rmm_wrap_get_stream_from_pool(size_t stream_id); -void* rmm_wrap_get_main_stream(void); +// void* rmm_wrap_get_next_stream_from_pool(void); +// void* rmm_wrap_get_stream_from_pool(size_t stream_id); +// void* rmm_wrap_get_main_stream(void); #ifdef __cplusplus } diff --git a/GraphBLAS/rmm_wrap/rmm_wrap.hpp b/GraphBLAS/rmm_wrap/rmm_wrap.hpp index 10ba7123c4..b293f57bde 100644 --- a/GraphBLAS/rmm_wrap/rmm_wrap.hpp +++ b/GraphBLAS/rmm_wrap/rmm_wrap.hpp @@ -24,8 +24,8 @@ #include #include #include -#include -#include +// #include +// #include #include #include #include @@ -45,6 +45,6 @@ typedef rmm::mr::pool_memory_resource managed_pool_mr; typedef std::unordered_map< std::size_t, std::size_t> alloc_map; -typedef rmm::cuda_stream_pool cuda_stream_pool; -typedef rmm::cuda_stream_view cuda_stream_view; +// typedef rmm::cuda_stream_pool cuda_stream_pool; +// typedef rmm::cuda_stream_view cuda_stream_view; diff --git a/GraphBLAS/rmm_wrap/rmm_wrap_test.c b/GraphBLAS/rmm_wrap/rmm_wrap_test.c index 32aa6b3d82..b845481387 100644 --- a/GraphBLAS/rmm_wrap/rmm_wrap_test.c +++ b/GraphBLAS/rmm_wrap/rmm_wrap_test.c @@ -16,7 +16,7 @@ int main() max_size = 256*(1ULL<<20); //printf(" pool init size %ld, max size %ld\n", init_size, max_size); - rmm_wrap_initialize_all_same( rmm_wrap_managed, init_size, max_size, stream_pool_size ); + rmm_wrap_initialize_all_same( rmm_wrap_managed, init_size, max_size /*, stream_pool_size */); printf("RMM initialized! in managed mode\n"); void *p; @@ -30,7 +30,7 @@ int main() rmm_wrap_deallocate( p, buff_size); rmm_wrap_finalize(); - rmm_wrap_initialize_all_same(rmm_wrap_device, init_size, max_size, stream_pool_size ); + rmm_wrap_initialize_all_same(rmm_wrap_device, init_size, max_size /*, stream_pool_size */); printf("RMM initialized! 
in device mode\n"); buff_size = (1ULL<<13)+157; diff --git a/README.md b/README.md index 29bb520fc5..75eda02f13 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ SuiteSparse: A Suite of Sparse matrix packages at http://suitesparse.com ----------------------------------------------------------------------------- -Nov 4, 2025, SuiteSparse VERSION 7.12.1 +Feb 5, 2026, SuiteSparse VERSION 7.12.2 SuiteSparse is a set of sparse-matrix-related packages written or co-authored by Tim Davis, available at https://github.com/DrTimothyAldenDavis/SuiteSparse . diff --git a/SuiteSparse_config/CMakeLists.txt b/SuiteSparse_config/CMakeLists.txt index 7ec5ce1b65..c53ec6621d 100644 --- a/SuiteSparse_config/CMakeLists.txt +++ b/SuiteSparse_config/CMakeLists.txt @@ -15,10 +15,10 @@ cmake_minimum_required ( VERSION 3.22 ) # version of both SuiteSparse and SuiteSparse_config -set ( SUITESPARSE_DATE "Nov 4, 2025" ) +set ( SUITESPARSE_DATE "Feb 5, 2026" ) set ( SUITESPARSE_VERSION_MAJOR 7 ) set ( SUITESPARSE_VERSION_MINOR 12 ) -set ( SUITESPARSE_VERSION_SUB 1 ) +set ( SUITESPARSE_VERSION_SUB 2 ) set ( SUITESPARSE_CONFIG_VERSION_MAJOR ${SUITESPARSE_VERSION_MAJOR} CACHE STRING "" FORCE ) set ( SUITESPARSE_CONFIG_VERSION_MINOR ${SUITESPARSE_VERSION_MINOR} CACHE STRING "" FORCE ) set ( SUITESPARSE_CONFIG_VERSION_PATCH ${SUITESPARSE_VERSION_SUB} CACHE STRING "" FORCE ) diff --git a/SuiteSparse_config/Config/SuiteSparse_config.h.in b/SuiteSparse_config/Config/SuiteSparse_config.h.in index ff2eb40180..6e06022bf8 100644 --- a/SuiteSparse_config/Config/SuiteSparse_config.h.in +++ b/SuiteSparse_config/Config/SuiteSparse_config.h.in @@ -589,7 +589,7 @@ int SuiteSparse_version // returns SUITESPARSE_VERSION ok = ok && ((sizeof (K) >= sizeof (k)) || ((int64_t)(K) == (int64_t)(k))) ; //------------------------------------------------------------------------------ -// SUITESPARSE_BLAS_SUFFIX: modify the name of a Fortran BLAS/LAPACK routine +// BLAS64_SUFFIX: modify the name of a Fortran 
BLAS/LAPACK routine //------------------------------------------------------------------------------ // OpenBLAS can be compiled by appending a suffix to each routine, so that the @@ -606,6 +606,10 @@ int SuiteSparse_version // returns SUITESPARSE_VERSION // cd build && cmake -DBLAS64_SUFFIX="_64" .. +// Fedora compiles OpenBLAS with the "64_" suffix and should use: + +// cd build && cmake -DBLAS64_SUFFIX="64_" .. + // This setting could be used by the spack packaging of SuiteSparse when linked // with the spack-installed OpenBLAS with 64-bit integers. See // https://github.com/spack/spack/blob/develop/var/spack/repos/builtin/packages/suite-sparse/package.py diff --git a/SuiteSparse_config/SuiteSparse_config.h b/SuiteSparse_config/SuiteSparse_config.h index a5f5ef45d8..68abaeda08 100644 --- a/SuiteSparse_config/SuiteSparse_config.h +++ b/SuiteSparse_config/SuiteSparse_config.h @@ -449,10 +449,10 @@ int SuiteSparse_version // returns SUITESPARSE_VERSION #define SUITESPARSE_HAS_VERSION_FUNCTION -#define SUITESPARSE_DATE "Nov 4, 2025" +#define SUITESPARSE_DATE "Feb 5, 2026" #define SUITESPARSE_MAIN_VERSION 7 #define SUITESPARSE_SUB_VERSION 12 -#define SUITESPARSE_SUBSUB_VERSION 1 +#define SUITESPARSE_SUBSUB_VERSION 2 // version format x.y #define SUITESPARSE_VER_CODE(main,sub) ((main) * 1000 + (sub)) @@ -461,7 +461,7 @@ int SuiteSparse_version // returns SUITESPARSE_VERSION // version format x.y.z #define SUITESPARSE__VERCODE(main,sub,patch) \ (((main)*1000ULL + (sub))*1000ULL + (patch)) -#define SUITESPARSE__VERSION SUITESPARSE__VERCODE(7,12,1) +#define SUITESPARSE__VERSION SUITESPARSE__VERCODE(7,12,2) //============================================================================== // SuiteSparse interface to the BLAS and LAPACK libraries @@ -589,7 +589,7 @@ int SuiteSparse_version // returns SUITESPARSE_VERSION ok = ok && ((sizeof (K) >= sizeof (k)) || ((int64_t)(K) == (int64_t)(k))) ; //------------------------------------------------------------------------------ 
-// SUITESPARSE_BLAS_SUFFIX: modify the name of a Fortran BLAS/LAPACK routine +// BLAS64_SUFFIX: modify the name of a Fortran BLAS/LAPACK routine //------------------------------------------------------------------------------ // OpenBLAS can be compiled by appending a suffix to each routine, so that the @@ -606,6 +606,10 @@ int SuiteSparse_version // returns SUITESPARSE_VERSION // cd build && cmake -DBLAS64_SUFFIX="_64" .. +// Fedora compiles OpenBLAS with the "64_" suffix and should use: + +// cd build && cmake -DBLAS64_SUFFIX="64_" .. + // This setting could be used by the spack packaging of SuiteSparse when linked // with the spack-installed OpenBLAS with 64-bit integers. See // https://github.com/spack/spack/blob/develop/var/spack/repos/builtin/packages/suite-sparse/package.py diff --git a/SuiteSparse_config/cmake_modules/SuiteSparseBLAS.cmake b/SuiteSparse_config/cmake_modules/SuiteSparseBLAS.cmake index b748ceb06a..a07ad1e2b6 100644 --- a/SuiteSparse_config/cmake_modules/SuiteSparseBLAS.cmake +++ b/SuiteSparse_config/cmake_modules/SuiteSparseBLAS.cmake @@ -11,11 +11,43 @@ # SuiteSparse interface to the Fortran BLAS library. # cmake 3.22 is required because BLA_SIZEOF_INTEGER is used. +cmake_minimum_required ( VERSION 3.22 ) + # The Intel MKL BLAS is highly recommended. It is free to download (but be # sure to check their license to make sure you accept it). See: -# https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.htm - -cmake_minimum_required ( VERSION 3.22 ) +# https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.htm . +# It includes an mkl_set_num_threads_local that ParU can use for best +# performance. + +# Otherwise, the OpenBLAS also gives excellent performance. The main downside +# of OpenBLAS is that its header (openblas_config.h) does not have an OpenBLAS +# version that can be used for conditional compilation. If OpenBLAS is used, +# v0.2.14 or later is required, and v0.3.27 or later is most desirable. 
+# OpenBLAS v0.3.27 and later includes openblas_set_num_threads_local. If this +# method is available, ParU uses it to obtain better performance, as compared +# to BLAS packages that do not have this option. + +# No other BLAS packages include a method to control the number of threads on a +# per-call basis to the BLAS. As a result, this script prioritizes Intel MKL +# and OpenBLAS over other BLAS libraries. + +# BLA_VENDOR may be set by the user before this script runs (see the comments +# below on how to look for a specific BLAS library); in this case, only that +# particular BLAS package is searched for, and if not found, it results in an +# error. + +# If BLA_VENDOR is not set by the user, then this script sets it to one value +# at a time, and searches for that particular BLAS library package. If not +# found, this script keeps looking, until it gives up looking for specific BLAS +# libraries and simply unsets BLA_VENDOR to find any BLAS library. + +# BLA_VENDOR is only an input parameter to find_package ( BLAS ), not +# an output. However, on output, this script sets BLA_VENDOR to the specific +# BLAS that was found, with one exception: if an unknown or generic BLAS +# library is found, then BLA_VENDOR is set to Generic. The BLA_VENDOR variable +# is then used by SuiteSparse__blas_threading.cmake, SuiteSparseBLAS32.cmake, +# SuiteSparseBLAS64.cmake, SuiteSparseLAPACK.cmake, to set other compile-time +# definitions based on this final value of BLA_VENDOR. 
# To select a specific BLAS: set to the BLA_VENDOR options from FindBLAS.cmake if ( DEFINED ENV{BLA_VENDOR} ) @@ -121,9 +153,9 @@ if ( SUITESPARSE_USE_64BIT_BLAS ) return ( ) endif ( ) - # Look for ARM BLAS with 64-bit integers - message ( STATUS "Looking for ARM 64-bit BLAS" ) - set ( BLA_VENDOR Arm_ilp64_mp ) + # Look for OpenBLAS with 64-bit integers + message ( STATUS "Looking for 64-bit OpenBLAS" ) + set ( BLA_VENDOR OpenBLAS ) set ( BLA_SIZEOF_INTEGER 8 ) find_package ( BLAS ) if ( BLAS_FOUND ) @@ -131,9 +163,9 @@ if ( SUITESPARSE_USE_64BIT_BLAS ) return ( ) endif ( ) - # Look for IBM BLAS with 64-bit integers - message ( STATUS "Looking for IBM ESSL 64-bit BLAS" ) - set ( BLA_VENDOR IBMESSL_SMP ) + # Look for ARM BLAS with 64-bit integers + message ( STATUS "Looking for ARM 64-bit BLAS" ) + set ( BLA_VENDOR Arm_ilp64_mp ) set ( BLA_SIZEOF_INTEGER 8 ) find_package ( BLAS ) if ( BLAS_FOUND ) @@ -141,9 +173,9 @@ if ( SUITESPARSE_USE_64BIT_BLAS ) return ( ) endif ( ) - # Look for OpenBLAS with 64-bit integers - message ( STATUS "Looking for 64-bit OpenBLAS" ) - set ( BLA_VENDOR OpenBLAS ) + # Look for IBM BLAS with 64-bit integers + message ( STATUS "Looking for IBM ESSL 64-bit BLAS" ) + set ( BLA_VENDOR IBMESSL_SMP ) set ( BLA_SIZEOF_INTEGER 8 ) find_package ( BLAS ) if ( BLAS_FOUND ) @@ -158,6 +190,7 @@ if ( SUITESPARSE_USE_64BIT_BLAS ) find_package ( BLAS ) if ( BLAS_FOUND ) include ( SuiteSparseBLAS64 ) + set ( BLA_VENDOR Generic ) return ( ) endif ( ) @@ -182,6 +215,16 @@ if ( BLAS_FOUND ) return ( ) endif ( ) +# Look for OpenBLAS with 32-bit integers +message ( STATUS "Looking for 32-bit OpenBLAS" ) +set ( BLA_VENDOR OpenBLAS ) +set ( BLA_SIZEOF_INTEGER 4 ) +find_package ( BLAS ) +if ( BLAS_FOUND ) + include ( SuiteSparseBLAS32 ) + return ( ) +endif ( ) + # Look for Apple Accelerate Framework (32-bit only) message ( STATUS "Looking for 32-bit Apple BLAS" ) set ( BLA_VENDOR Apple ) @@ -212,16 +255,6 @@ if ( BLAS_FOUND ) return ( ) endif ( ) -# Look for 
OpenBLAS with 32-bit integers -message ( STATUS "Looking for 32-bit OpenBLAS" ) -set ( BLA_VENDOR OpenBLAS ) -set ( BLA_SIZEOF_INTEGER 4 ) -find_package ( BLAS ) -if ( BLAS_FOUND ) - include ( SuiteSparseBLAS32 ) - return ( ) -endif ( ) - # Look for FLAME BLAS(32-bit only) message ( STATUS "Looking for 32-bit FLAME (BLIS) BLAS" ) set ( BLA_VENDOR FLAME ) @@ -242,4 +275,5 @@ message ( STATUS "Looking for any 32-bit BLAS" ) set ( BLA_SIZEOF_INTEGER 4 ) find_package ( BLAS REQUIRED ) include ( SuiteSparseBLAS32 ) +set ( BLA_VENDOR Generic )