From f602cf2162f581f8e6a576b433c9fd8222beb32a Mon Sep 17 00:00:00 2001 From: Mike Vogt Date: Mon, 16 Mar 2026 14:42:20 +0100 Subject: [PATCH 1/2] Fixed some issues in CMakeLists.txt; it now builds both cuSolverRf and cuSolverRfBatch. Also fixed issues in both source files, which used the API incorrectly and passed a pointer to a pointer. Updated helper_cuda.h to cover more recent architectures. --- CMakeLists.txt | 49 +++- Makefile | 632 ++++++++++++++++++++------------------------ cuSolverRf.cpp | 12 +- cuSolverRfBatch.cpp | 23 +- inc/helper_cuda.h | 3 +- 5 files changed, 359 insertions(+), 360 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6d2df67..1108d0f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,28 +5,65 @@ if (UNIX) endif (UNIX) set(PROJECT_NAME cuSolverRfBatch) -project (${PROJECT_NAME} LANGUAGES C CXX CUDA) -set(SRC_FILES +project(cuSolverRfBatch LANGUAGES C CXX CUDA) + +# CUDA architecture configuration +set(CMAKE_CUDA_ARCHITECTURES 75 80 86 90) +find_package(CUDAToolkit REQUIRED) + +set(SHARED_SOURCES mmio_wrapper.cpp - cuSolverRfBatch.cpp ) -add_executable(${PROJECT_NAME} ${SRC_FILES}) add_library(mmio mmio.c) +add_executable(cuSolverRf + cuSolverRf.cpp + ${SHARED_SOURCES} +) + +add_executable(cuSolverRfBatch + cuSolverRfBatch.cpp + ${SHARED_SOURCES} +) + # CUDA +#set(CUDA_USE_STATIC_CUDA_RUNTIME OFF) + +#find_package(CUDA REQUIRED) +#include_directories("${CUDA_INCLUDE_DIRS}") + find_library(CUDA_LIBRARY_CUDART cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) find_library(CUDA_LIBRARY_CUSOLVER cusolver ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) find_library(CUDA_LIBRARY_CUSPARSE cusparse ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) + +target_include_directories( + cuSolverRf + PRIVATE + ${CUDAToolkit_INCLUDE_DIRS} + ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} + "./inc/" +) + target_include_directories( - ${PROJECT_NAME} + cuSolverRfBatch PRIVATE + ${CUDAToolkit_INCLUDE_DIRS} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} 
"./inc/" ) + +target_link_libraries( + cuSolverRfBatch + ${CUDA_LIBRARY_CUDART} + ${CUDA_LIBRARY_CUSOLVER} + ${CUDA_LIBRARY_CUSPARSE} + mmio +) + target_link_libraries( - ${PROJECT_NAME} + cuSolverRf ${CUDA_LIBRARY_CUDART} ${CUDA_LIBRARY_CUSOLVER} ${CUDA_LIBRARY_CUSPARSE} diff --git a/Makefile b/Makefile index e44221d..0b998ba 100755 --- a/Makefile +++ b/Makefile @@ -1,345 +1,293 @@ -################################################################################ -# -# Copyright 1993-2015 NVIDIA Corporation. All rights reserved. -# -# NOTICE TO USER: -# -# This source code is subject to NVIDIA ownership rights under U.S. and -# international Copyright laws. -# -# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE -# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR -# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH -# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. -# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, -# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS -# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE -# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE -# OR PERFORMANCE OF THIS SOURCE CODE. -# -# U.S. Government End Users. This source code is a "commercial item" as -# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of -# "commercial computer software" and "commercial computer software -# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) -# and is provided to the U.S. Government only as a commercial end item. -# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through -# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the -# source code with only those rights set forth herein. 
-# -################################################################################ -# -# Makefile project only supported on Mac OS X and Linux Platforms) -# -################################################################################ - -# Location of the CUDA Toolkit -CUDA_PATH ?= /usr/local/cuda - -############################## -# start deprecated interface # -############################## -ifeq ($(x86_64),1) - $(info WARNING - x86_64 variable has been deprecated) - $(info WARNING - please use TARGET_ARCH=x86_64 instead) - TARGET_ARCH ?= x86_64 -endif -ifeq ($(ARMv7),1) - $(info WARNING - ARMv7 variable has been deprecated) - $(info WARNING - please use TARGET_ARCH=armv7l instead) - TARGET_ARCH ?= armv7l -endif -ifeq ($(aarch64),1) - $(info WARNING - aarch64 variable has been deprecated) - $(info WARNING - please use TARGET_ARCH=aarch64 instead) - TARGET_ARCH ?= aarch64 -endif -ifeq ($(ppc64le),1) - $(info WARNING - ppc64le variable has been deprecated) - $(info WARNING - please use TARGET_ARCH=ppc64le instead) - TARGET_ARCH ?= ppc64le -endif -ifneq ($(GCC),) - $(info WARNING - GCC variable has been deprecated) - $(info WARNING - please use HOST_COMPILER=$(GCC) instead) - HOST_COMPILER ?= $(GCC) -endif -ifneq ($(abi),) - $(error ERROR - abi variable has been removed) -endif -############################ -# end deprecated interface # -############################ - -# architecture -HOST_ARCH := $(shell uname -m) -TARGET_ARCH ?= $(HOST_ARCH) -ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) - ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) - TARGET_SIZE := 64 - else ifneq (,$(filter $(TARGET_ARCH),armv7l)) - TARGET_SIZE := 32 - endif - else - TARGET_SIZE := $(shell getconf LONG_BIT) - endif -else - $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) -endif - -# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. 
-ifeq ($(HOST_ARCH),aarch64) - ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null)) - HOST_ARCH := sbsa - TARGET_ARCH := sbsa - endif -endif - -ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) - $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) - endif -endif - -# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l -ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) - TARGET_ARCH = armv7l -endif - -# operating system -HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") -TARGET_OS ?= $(HOST_OS) -ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) - $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) -endif - -# host compiler -ifeq ($(TARGET_OS),darwin) - ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' 
-f1` \>= 5),1) - HOST_COMPILER ?= clang++ - endif -else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) - ifeq ($(TARGET_OS),linux) - HOST_COMPILER ?= arm-linux-gnueabihf-g++ - else ifeq ($(TARGET_OS),qnx) - ifeq ($(QNX_HOST),) - $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) - endif - ifeq ($(QNX_TARGET),) - $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) - endif - export QNX_HOST - export QNX_TARGET - HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ - else ifeq ($(TARGET_OS),android) - HOST_COMPILER ?= arm-linux-androideabi-g++ - endif - else ifeq ($(TARGET_ARCH),aarch64) - ifeq ($(TARGET_OS), linux) - HOST_COMPILER ?= aarch64-linux-gnu-g++ - else ifeq ($(TARGET_OS),qnx) - ifeq ($(QNX_HOST),) - $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) - endif - ifeq ($(QNX_TARGET),) - $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) - endif - export QNX_HOST - export QNX_TARGET - HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++ - else ifeq ($(TARGET_OS), android) - HOST_COMPILER ?= aarch64-linux-android-clang++ - endif - else ifeq ($(TARGET_ARCH),sbsa) - HOST_COMPILER ?= aarch64-linux-gnu-g++ - else ifeq ($(TARGET_ARCH),ppc64le) - HOST_COMPILER ?= powerpc64le-linux-gnu-g++ - endif -endif -HOST_COMPILER ?= g++ -NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) - -# internal flags -NVCCFLAGS := -m${TARGET_SIZE} -CCFLAGS := -LDFLAGS := - -# build flags -ifeq ($(TARGET_OS),darwin) - LDFLAGS += -rpath $(CUDA_PATH)/lib - CCFLAGS += -arch $(HOST_ARCH) -else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) - LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 - CCFLAGS += -mfloat-abi=hard -else ifeq ($(TARGET_OS),android) - LDFLAGS += -pie - CCFLAGS += -fpie -fpic -fexceptions -endif - -ifneq ($(TARGET_ARCH),$(HOST_ARCH)) - ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) - ifneq ($(TARGET_FS),) - GCCVERSIONLTEQ46 := $(shell 
expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) - ifeq ($(GCCVERSIONLTEQ46),1) - CCFLAGS += --sysroot=$(TARGET_FS) - endif - LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf - endif - endif - ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) - ifneq ($(TARGET_FS),) - GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) - ifeq ($(GCCVERSIONLTEQ46),1) - CCFLAGS += --sysroot=$(TARGET_FS) - endif - LDFLAGS += --sysroot=$(TARGET_FS) - LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib - LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu - LDFLAGS += --unresolved-symbols=ignore-in-shared-libs - CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include - CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu - endif - endif - ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) - NVCCFLAGS += --qpp-config 5.4.0,gcc_ntoaarch64le - CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu - LDFLAGS += -lsocket - LDFLAGS += -L/usr/lib/aarch64-qnx-gnu - CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu" - ifdef TARGET_OVERRIDE - LDFLAGS += -lslog2 - endif - - ifneq ($(TARGET_FS),) - LDFLAGS += -L$(TARGET_FS)/usr/lib - CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib" - LDFLAGS += -L$(TARGET_FS)/usr/libnvidia - CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia" - endif - endif -endif - -ifdef TARGET_OVERRIDE # cuda toolkit targets override - NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) -endif - -# Install directory of different arch -CUDA_INSTALL_TARGET_DIR := -ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) - 
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ -else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) - CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ -else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) - CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ -else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) - CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ -else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) - CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ -else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) - CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ -else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) - CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ -else ifeq ($(TARGET_ARCH),ppc64le) - CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ -endif - -# Debug build flags -ifeq ($(dbg),1) - NVCCFLAGS += -g -G - BUILD_TYPE := debug -else - BUILD_TYPE := release -endif - -ALL_CCFLAGS := -ALL_CCFLAGS += $(NVCCFLAGS) -ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) -ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) -ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) - -SAMPLE_ENABLED := 1 - -ifeq ($(TARGET_OS),linux) -ALL_CCFLAGS += -Xcompiler \"-Wl,--no-as-needed\" -endif - -ALL_LDFLAGS := -ALL_LDFLAGS += $(ALL_CCFLAGS) -ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) -ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) - -# Common includes and paths for CUDA -INCLUDES := -I./inc -LIBRARIES := - -################################################################################ - -# Gencode arguments -SMS ?= 35 37 50 52 60 61 70 75 80 86 - -ifeq ($(SMS),) -$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) -SAMPLE_ENABLED := 0 -endif - -ifeq ($(GENCODE_FLAGS),) -# Generate SASS code for each SM architecture listed in $(SMS) -$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) - -# Generate PTX code from the highest SM architecture in $(SMS) to guarantee 
forward-compatibility -HIGHEST_SM := $(lastword $(sort $(SMS))) -ifneq ($(HIGHEST_SM),) -GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) -endif -endif - -LIBRARIES += -lcusolver -lcublas -lcusparse - -ifeq ($(SAMPLE_ENABLED),0) -EXEC ?= @echo "[@]" -endif - -################################################################################ - -# Target rules -all: cuSolverRf cuSolverRfBatch - -check.deps: -ifeq ($(SAMPLE_ENABLED),0) - @echo "Sample will be waived due to the above missing dependencies" -else - @echo "Sample is ready - all dependencies have been met" -endif - -cuSolverRf.o:cuSolverRf.cpp - $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< - -cuSolverRfBatch.o:cuSolverRfBatch.cpp - $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< - -mmio.c.o:mmio.c - $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< - -mmio_wrapper.o:mmio_wrapper.cpp - $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< - -cuSolverRf: cuSolverRf.o mmio.c.o mmio_wrapper.o - $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) - #$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) - #$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) - -cuSolverRfBatch: cuSolverRfBatch.o mmio.c.o mmio_wrapper.o - $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 3.22 +# Default target executed when no arguments are given to make. +default_target: all +.PHONY : default_target + +# Allow only one "make -f Makefile2" at a time, but pass parallelism. +.NOTPARALLEL: + +#============================================================================= +# Special targets provided by cmake. + +# Disable implicit rules so canonical targets will work. +.SUFFIXES: + +# Disable VCS-based implicit rules. 
+% : %,v + +# Disable VCS-based implicit rules. +% : RCS/% + +# Disable VCS-based implicit rules. +% : RCS/%,v + +# Disable VCS-based implicit rules. +% : SCCS/s.% + +# Disable VCS-based implicit rules. +% : s.% + +.SUFFIXES: .hpux_make_needs_suffix_list + +# Command-line flag to silence nested $(MAKE). +$(VERBOSE)MAKESILENT = -s + +#Suppress display of executed commands. +$(VERBOSE).SILENT: + +# A target that is always out of date. +cmake_force: +.PHONY : cmake_force + +#============================================================================= +# Set environment variables for the build. + +# The shell in which to execute make rules. +SHELL = /bin/sh + +# The CMake executable. +CMAKE_COMMAND = /usr/bin/cmake + +# The command to remove a file. +RM = /usr/bin/cmake -E rm -f + +# Escaping for special characters. +EQUALS = = + +# The top-level source directory on which CMake was run. +CMAKE_SOURCE_DIR = /mnt/c/temp/cuSolverRf-batch-master + +# The top-level build directory on which CMake was run. +CMAKE_BINARY_DIR = /mnt/c/temp/cuSolverRf-batch-master + +#============================================================================= +# Targets provided globally by CMake. + +# Special rule for the target edit_cache +edit_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..." + /usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available. +.PHONY : edit_cache + +# Special rule for the target edit_cache +edit_cache/fast: edit_cache +.PHONY : edit_cache/fast + +# Special rule for the target rebuild_cache +rebuild_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..." 
+ /usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : rebuild_cache + +# Special rule for the target rebuild_cache +rebuild_cache/fast: rebuild_cache +.PHONY : rebuild_cache/fast + +# The main all target +all: cmake_check_build_system + $(CMAKE_COMMAND) -E cmake_progress_start /mnt/c/temp/cuSolverRf-batch-master/CMakeFiles /mnt/c/temp/cuSolverRf-batch-master//CMakeFiles/progress.marks + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 all + $(CMAKE_COMMAND) -E cmake_progress_start /mnt/c/temp/cuSolverRf-batch-master/CMakeFiles 0 +.PHONY : all + +# The main clean target clean: - rm -f cuSolverRf cuSolverRfBatch cuSolverRf.o cuSolverRfBatch.o mmio.c.o mmio_wrapper.o - #rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/cuSolverRf + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 clean +.PHONY : clean + +# The main clean target +clean/fast: clean +.PHONY : clean/fast + +# Prepare targets for installation. +preinstall: all + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 preinstall +.PHONY : preinstall + +# Prepare targets for installation. +preinstall/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 preinstall +.PHONY : preinstall/fast + +# clear depends +depend: + $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1 +.PHONY : depend + +#============================================================================= +# Target rules for targets named mmio + +# Build rule for target. +mmio: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 mmio +.PHONY : mmio + +# fast build rule for target. +mmio/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/mmio.dir/build.make CMakeFiles/mmio.dir/build +.PHONY : mmio/fast + +#============================================================================= +# Target rules for targets named cuSolverRf + +# Build rule for target. 
+cuSolverRf: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 cuSolverRf +.PHONY : cuSolverRf + +# fast build rule for target. +cuSolverRf/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/cuSolverRf.dir/build.make CMakeFiles/cuSolverRf.dir/build +.PHONY : cuSolverRf/fast + +#============================================================================= +# Target rules for targets named cuSolverRfBatch + +# Build rule for target. +cuSolverRfBatch: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 cuSolverRfBatch +.PHONY : cuSolverRfBatch + +# fast build rule for target. +cuSolverRfBatch/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/cuSolverRfBatch.dir/build.make CMakeFiles/cuSolverRfBatch.dir/build +.PHONY : cuSolverRfBatch/fast + +cuSolverRf.o: cuSolverRf.cpp.o +.PHONY : cuSolverRf.o + +# target to build an object file +cuSolverRf.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/cuSolverRf.dir/build.make CMakeFiles/cuSolverRf.dir/cuSolverRf.cpp.o +.PHONY : cuSolverRf.cpp.o + +cuSolverRf.i: cuSolverRf.cpp.i +.PHONY : cuSolverRf.i + +# target to preprocess a source file +cuSolverRf.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/cuSolverRf.dir/build.make CMakeFiles/cuSolverRf.dir/cuSolverRf.cpp.i +.PHONY : cuSolverRf.cpp.i + +cuSolverRf.s: cuSolverRf.cpp.s +.PHONY : cuSolverRf.s + +# target to generate assembly for a file +cuSolverRf.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/cuSolverRf.dir/build.make CMakeFiles/cuSolverRf.dir/cuSolverRf.cpp.s +.PHONY : cuSolverRf.cpp.s + +cuSolverRfBatch.o: cuSolverRfBatch.cpp.o +.PHONY : cuSolverRfBatch.o + +# target to build an object file +cuSolverRfBatch.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/cuSolverRfBatch.dir/build.make CMakeFiles/cuSolverRfBatch.dir/cuSolverRfBatch.cpp.o +.PHONY : cuSolverRfBatch.cpp.o + +cuSolverRfBatch.i: cuSolverRfBatch.cpp.i +.PHONY : cuSolverRfBatch.i + +# target to preprocess a source file +cuSolverRfBatch.cpp.i: + $(MAKE) $(MAKESILENT) -f 
CMakeFiles/cuSolverRfBatch.dir/build.make CMakeFiles/cuSolverRfBatch.dir/cuSolverRfBatch.cpp.i +.PHONY : cuSolverRfBatch.cpp.i + +cuSolverRfBatch.s: cuSolverRfBatch.cpp.s +.PHONY : cuSolverRfBatch.s + +# target to generate assembly for a file +cuSolverRfBatch.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/cuSolverRfBatch.dir/build.make CMakeFiles/cuSolverRfBatch.dir/cuSolverRfBatch.cpp.s +.PHONY : cuSolverRfBatch.cpp.s + +mmio.o: mmio.c.o +.PHONY : mmio.o + +# target to build an object file +mmio.c.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/mmio.dir/build.make CMakeFiles/mmio.dir/mmio.c.o +.PHONY : mmio.c.o + +mmio.i: mmio.c.i +.PHONY : mmio.i + +# target to preprocess a source file +mmio.c.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/mmio.dir/build.make CMakeFiles/mmio.dir/mmio.c.i +.PHONY : mmio.c.i + +mmio.s: mmio.c.s +.PHONY : mmio.s + +# target to generate assembly for a file +mmio.c.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/mmio.dir/build.make CMakeFiles/mmio.dir/mmio.c.s +.PHONY : mmio.c.s + +mmio_wrapper.o: mmio_wrapper.cpp.o +.PHONY : mmio_wrapper.o + +# target to build an object file +mmio_wrapper.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/cuSolverRf.dir/build.make CMakeFiles/cuSolverRf.dir/mmio_wrapper.cpp.o + $(MAKE) $(MAKESILENT) -f CMakeFiles/cuSolverRfBatch.dir/build.make CMakeFiles/cuSolverRfBatch.dir/mmio_wrapper.cpp.o +.PHONY : mmio_wrapper.cpp.o + +mmio_wrapper.i: mmio_wrapper.cpp.i +.PHONY : mmio_wrapper.i + +# target to preprocess a source file +mmio_wrapper.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/cuSolverRf.dir/build.make CMakeFiles/cuSolverRf.dir/mmio_wrapper.cpp.i + $(MAKE) $(MAKESILENT) -f CMakeFiles/cuSolverRfBatch.dir/build.make CMakeFiles/cuSolverRfBatch.dir/mmio_wrapper.cpp.i +.PHONY : mmio_wrapper.cpp.i + +mmio_wrapper.s: mmio_wrapper.cpp.s +.PHONY : mmio_wrapper.s + +# target to generate assembly for a file +mmio_wrapper.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/cuSolverRf.dir/build.make CMakeFiles/cuSolverRf.dir/mmio_wrapper.cpp.s + 
$(MAKE) $(MAKESILENT) -f CMakeFiles/cuSolverRfBatch.dir/build.make CMakeFiles/cuSolverRfBatch.dir/mmio_wrapper.cpp.s +.PHONY : mmio_wrapper.cpp.s + +# Help Target +help: + @echo "The following are some of the valid targets for this Makefile:" + @echo "... all (the default if no target is provided)" + @echo "... clean" + @echo "... depend" + @echo "... edit_cache" + @echo "... rebuild_cache" + @echo "... cuSolverRf" + @echo "... cuSolverRfBatch" + @echo "... mmio" + @echo "... cuSolverRf.o" + @echo "... cuSolverRf.i" + @echo "... cuSolverRf.s" + @echo "... cuSolverRfBatch.o" + @echo "... cuSolverRfBatch.i" + @echo "... cuSolverRfBatch.s" + @echo "... mmio.o" + @echo "... mmio.i" + @echo "... mmio.s" + @echo "... mmio_wrapper.o" + @echo "... mmio_wrapper.i" + @echo "... mmio_wrapper.s" +.PHONY : help + + + +#============================================================================= +# Special targets to cleanup operation of make. + +# Special rule to run CMake to check the build system integrity. +# No rule that depends on this can have commands that come from listfiles +# because they might be regenerated. 
+cmake_check_build_system: + $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0 +.PHONY : cmake_check_build_system -clobber: clean diff --git a/cuSolverRf.cpp b/cuSolverRf.cpp index 9442eee..3cd2b54 100644 --- a/cuSolverRf.cpp +++ b/cuSolverRf.cpp @@ -525,12 +525,14 @@ int main (int argc, char *argv[]) checkCudaErrors(cusparseSpMV_bufferSize( cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx, &one, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize)); - void *buffer = NULL; - checkCudaErrors(cudaMalloc(&buffer, bufferSize)); + void *buffer = nullptr; + if (bufferSize > 0) { + checkCudaErrors(cudaMalloc(&buffer, bufferSize)); + } checkCudaErrors(cusparseSpMV( cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx, - &one, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &buffer)); + &one, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, buffer)); checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*rowsA, cudaMemcpyDeviceToHost)); @@ -713,7 +715,7 @@ int main (int argc, char *argv[]) checkCudaErrors(cusparseSpMV( cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx, - &one, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &buffer)); + &one, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, buffer)); checkCudaErrors(cudaMemcpy(h_x, d_x, sizeof(double)*colsA, cudaMemcpyDeviceToHost)); checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*rowsA, cudaMemcpyDeviceToHost)); @@ -743,6 +745,8 @@ int main (int argc, char *argv[]) printf(" cusolverRf refactor : %f sec\n", time_rf_refactor); printf(" cusolverRf solve : %f sec\n", time_rf_solve); + if (buffer) { checkCudaErrors(cudaFree(buffer)); } + if (cusolverRfH) { checkCudaErrors(cusolverRfDestroy(cusolverRfH)); } if (cusolverSpH) { checkCudaErrors(cusolverSpDestroy(cusolverSpH)); } if (cusparseH ) { checkCudaErrors(cusparseDestroy(cusparseH)); } diff --git a/cuSolverRfBatch.cpp b/cuSolverRfBatch.cpp index 627d7a2..82e25f3 100644 --- 
a/cuSolverRfBatch.cpp +++ b/cuSolverRfBatch.cpp @@ -615,9 +615,12 @@ int main (int argc, char *argv[]) checkCudaErrors(cusparseSpMV( cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx, - &one, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &buffer)); - - checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*rowsA, cudaMemcpyDeviceToHost)); + // &one, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &buffer)); + &one, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, buffer)); + + checkCudaErrors(cudaDeviceSynchronize()); + + checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*rowsA, cudaMemcpyDeviceToHost)); x_inf = vec_norminf(colsA, h_x); r_inf = vec_norminf(rowsA, h_r); @@ -734,7 +737,8 @@ int main (int argc, char *argv[]) for (int i = 0; i < batchSize; ++i) { - h_A_array[i] = &(h_A_batch[batchSize*i]); + // h_A_array[i] = &(h_A_batch[batchSize*i]); + h_A_array[i] = h_A_batch + i*nnzA; } checkCudaErrors(cusolverRfBatchSetupHost( batchSize, @@ -805,14 +809,19 @@ int main (int argc, char *argv[]) //checkCudaErrors(cudaMemcpy(d_r, h_b, sizeof(double)*rowsA, cudaMemcpyHostToDevice)); for (int i=0; i < batchSize; ++i) { - checkCudaErrors(cudaMemcpy(d_r, &h_X_batch[i*colsA], sizeof(double)*rowsA, cudaMemcpyHostToDevice)); + //checkCudaErrors(cudaMemcpy(d_r, &h_X_batch[i*colsA], sizeof(double)*rowsA, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_r, h_b, sizeof(double)*rowsA, cudaMemcpyHostToDevice)); // todo: cusparseSpMM checkCudaErrors(cusparseDnVecSetValues( - vecx, &d_X_batch[i*colsA])); + // vecx, &d_X_batch[i*colsA])); + vecx, d_X_batch + i*colsA)); checkCudaErrors(cusparseSpMV( cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx, - &one, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &buffer)); + //&one, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &buffer)); + &one, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, buffer)); + + checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaMemcpy(h_x, &d_X_batch[i*colsA], sizeof(double)*colsA, 
cudaMemcpyDeviceToHost)); checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*rowsA, cudaMemcpyDeviceToHost)); diff --git a/inc/helper_cuda.h b/inc/helper_cuda.h index 374d01f..9b9a16e 100755 --- a/inc/helper_cuda.h +++ b/inc/helper_cuda.h @@ -1077,7 +1077,8 @@ inline int _ConvertSMVer2Cores(int major, int minor) { 0x61, 128}, // Pascal Generation (SM 6.1) GP10x class { 0x62, 128}, // Pascal Generation (SM 6.2) GP10x class { 0x70, 64 }, // Volta Generation (SM 7.0) GV100 class - + { 0x80, 64 }, // Ampere Generation (SM 8.0) GA100 class + { 0x86, 128}, // Ampere Generation (SM 8.6) GA10x class { -1, -1 } }; From c2134d58402ace94afeb973c46cf14fcc48145cc Mon Sep 17 00:00:00 2001 From: Mike Vogt Date: Wed, 18 Mar 2026 11:46:46 +0100 Subject: [PATCH 2/2] Removed some duplicated statements and an unnecessary cudaDeviceSynchronize --- cuSolverRfBatch.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/cuSolverRfBatch.cpp b/cuSolverRfBatch.cpp index 82e25f3..8346bb1 100644 --- a/cuSolverRfBatch.cpp +++ b/cuSolverRfBatch.cpp @@ -615,11 +615,8 @@ int main (int argc, char *argv[]) checkCudaErrors(cusparseSpMV( cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx, - // &one, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &buffer)); &one, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, buffer)); - - checkCudaErrors(cudaDeviceSynchronize()); - + checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*rowsA, cudaMemcpyDeviceToHost)); x_inf = vec_norminf(colsA, h_x); @@ -733,7 +730,6 @@ int main (int argc, char *argv[]) printf("step 9: assemble P*A*Q = L*U \n"); start = second(); - start = second(); for (int i = 0; i < batchSize; ++i) { @@ -796,7 +792,6 @@ int main (int argc, char *argv[]) //checkCudaErrors(cudaMemcpy(d_x, h_b, sizeof(double)*rowsA, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_X_batch, h_X_batch, sizeof(double)*batchSize*rowsA, cudaMemcpyHostToDevice)); - start = second(); start = second(); 
checkCudaErrors(cusolverRfBatchSolve(cusolverRfH, d_P, d_Q, 1, d_T, rowsA, d_X_array, rowsA));