utilForever · jeffdaily · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026 · Jun 23, 2026
diff --git a/Builds/CMake/CompileOptions.cmake b/Builds/CMake/CompileOptions.cmake
@@ -125,8 +125,10 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 endif()
 
 if (CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+	# Scope to the C/C++ host compiler: hipcc/clang rejects -Wno-class-memaccess
+	# as an unknown warning option under -Werror when it sees HIP source files.
 	set(DEFAULT_COMPILE_OPTIONS ${DEFAULT_COMPILE_OPTIONS}
-		-Wno-class-memaccess	# -> disable warning: error: 'void* memcpy(void*, const void*, size_t)' ... [-Werror=class-memaccess] (caused by imgui)
+		$<$<COMPILE_LANGUAGE:C,CXX>:-Wno-class-memaccess>	# -> disable warning: error: 'void* memcpy(void*, const void*, size_t)' ... [-Werror=class-memaccess] (caused by imgui)
 	)
 endif ()
 
@@ -138,6 +140,27 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 	)
 endif ()
 
+# Clang on Windows reports -Wnontrivial-memcall for memset/memcpy calls in the
+# bundled Flatbuffers-generated headers (pre-existing upstream code, not part
+# of the HIP port). GCC and MSVC do not emit this warning, so it is suppressed
+# for Windows+Clang only.
+if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND WIN32)
+	set(DEFAULT_COMPILE_OPTIONS ${DEFAULT_COMPILE_OPTIONS}
+		-Wno-nontrivial-memcall
+	)
+endif ()
+
+# hipcc/clang reports warnings on the existing CUDA sources that nvcc does not.
+# The suppressions must come AFTER -Werror (clang honors a later -Wno-* over an
+# earlier -Werror), so append them at the end and scope them to HIP only.
+if (USE_HIP)
+	set(DEFAULT_COMPILE_OPTIONS ${DEFAULT_COMPILE_OPTIONS}
+		$<$<COMPILE_LANGUAGE:HIP>:-Wno-reorder-ctor>
+		$<$<COMPILE_LANGUAGE:HIP>:-Wno-unused-private-field>
+		$<$<COMPILE_LANGUAGE:HIP>:-Wno-unused-variable>
+	)
+endif ()
+
 #
 # Linker options
 #

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -44,9 +44,32 @@ elseif (CMAKE_BUILD_TYPE MATCHES "Debug")
 endif()
 message(STATUS "CMake build type: ${CMAKE_BUILD_TYPE}")
 
-# CUDA
+# GPU backend selection
+#   USE_CUDA -> NVIDIA CUDA (default)
+#   USE_HIP  -> AMD ROCm/HIP
+# The two are mutually exclusive; enabling HIP turns CUDA off.
 option(USE_CUDA "Use CUDA features" ON)
-set(CUDA_CRT_LINKAGE "dynamic" CACHE STRING 
+option(USE_HIP "Use AMD ROCm/HIP features" OFF)
+if (USE_HIP)
+    # HIP and CUDA are mutually exclusive: selecting HIP switches CUDA off.
+    set(USE_CUDA OFF)
+    # enable_language(HIP) honors -DCMAKE_HIP_ARCHITECTURES, else auto-detects the
+    # host GPU, else errors on a no-GPU host.
+    enable_language(HIP)
+    # Force-include the compat shim on every HIP translation unit so the CUDA
+    # spelling resolves regardless of include order and the device-vs-host
+    # __CUDA_ARCH__ idiom is defined before any header uses it.
+    set(CUBBYFLOW_CUDA_TO_HIP_HEADER
+        ${CMAKE_CURRENT_SOURCE_DIR}/Includes/Core/CUDA/cuda_to_hip.h)
+    string(APPEND CMAKE_HIP_FLAGS
+        " -include \"${CUBBYFLOW_CUDA_TO_HIP_HEADER}\"")
+    add_compile_definitions(CUBBYFLOW_USE_CUDA)
+    # The shim's __CUDACC__ define keeps kernel/device guards visible but makes
+    # rocThrust auto-detect its CUDA backend (CUDA-only CUB header); pin to HIP.
+    add_compile_definitions(THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP)
+    message(STATUS "Using HIP: arch ${CMAKE_HIP_ARCHITECTURES}")
+endif()
+set(CUDA_CRT_LINKAGE "dynamic" CACHE STRING
                      "CUDA targets' CRT options. For 'static', use /MT or /MTd. For 'dynamic', use /MD or /MDd")
 if (USE_CUDA)
     set(CUDA_LINK_LIBRARIES_KEYWORD PUBLIC)
@@ -156,7 +179,7 @@ option(BUILD_TESTS "Build the CubbyFlow test programs" ON)
 if (BUILD_TESTS)
     add_subdirectory(Tests/ManualTests)
     add_subdirectory(Tests/UnitTests)
-    if (USE_CUDA)
+    if (USE_CUDA OR USE_HIP)
         add_subdirectory(Tests/CUDATests)
     endif()
     if (NOT BUILD_SONARCLOUD)
@@ -167,7 +190,7 @@ endif()
 
 option(BUILD_EXAMPLES "Build the CubbyFlow example programs" ON)
 if (BUILD_EXAMPLES)
-    if (USE_CUDA)
+    if (USE_CUDA OR USE_HIP)
         add_subdirectory(Examples/CUDASPHSim)
     endif()
     if (NOT BUILD_SONARCLOUD)
@@ -182,7 +205,7 @@ if (BUILD_EXAMPLES)
     endif()
 endif()
 
-if (NOT USE_CUDA AND NOT BUILD_SONARCLOUD)
+if (NOT (USE_CUDA OR USE_HIP) AND NOT BUILD_SONARCLOUD)
     add_subdirectory(Libraries/pybind11)
     if (BUILD_FROM_PIP)
         set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${DEFAULT_CMAKE_LIBRARY_OUTPUT_DIRECTORY})

diff --git a/Documents/Install.md b/Documents/Install.md
@@ -228,6 +228,21 @@ bin/UnitTests
 
 It should show all the tests are passing.
 
+### Building with GPU Acceleration
+
+CubbyFlow's CUDA SPH solvers can be built for either NVIDIA or AMD GPUs. The two backends are mutually exclusive and are selected at configure time. NVIDIA CUDA is enabled by default (`USE_CUDA=ON`), so the build instructions above already produce the CUDA solvers when a CUDA toolkit is found.
+
+To build the same solvers for AMD GPUs with ROCm/HIP instead, configure with `USE_HIP=ON` (which turns `USE_CUDA` off) and, if desired, set the target GPU architecture:
+
+```bash
+mkdir build
+cd build
+cmake .. -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx90a
+make
+```
+
+`CMAKE_HIP_ARCHITECTURES` accepts a semicolon-separated list of AMD GPU targets (for example `gfx90a` or `gfx1100`). When it is omitted, CMake tries to auto-detect the GPU of the build machine, so set it explicitly when configuring on a host without an AMD GPU. A working ROCm installation with HIP is required, and `hipcc` should be on the path so CMake can enable the HIP language.
+
 ### Running Tests
 
 There are two different tests in the codebase including the unit test and manual test. For the detailed instruction on how to run those tests, please checkout the documentation page from [the project website](https://utilforever.github.io/CubbyFlow/Documentation/).

diff --git a/Examples/CUDASPHSim/CMakeLists.txt b/Examples/CUDASPHSim/CMakeLists.txt
@@ -5,7 +5,7 @@ set(target CUDASPHSim)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 
 # Sources
-if (USE_CUDA)
+if (USE_CUDA OR USE_HIP)
     file(GLOB_RECURSE sources
         ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/*.cu)
@@ -19,6 +19,10 @@ if (USE_CUDA)
     cuda_add_executable(${target} ${sources}
         OPTIONS ${CUDA_TARGET_OPTIONS}
     )
+elseif (USE_HIP)
+    file(GLOB_RECURSE hip_sources ${CMAKE_CURRENT_SOURCE_DIR}/*.cu)
+    set_source_files_properties(${hip_sources} PROPERTIES LANGUAGE HIP)
+    add_executable(${target} ${sources})
 else()
     add_executable(${target} ${sources})
 endif()

diff --git a/Examples/CUDASPHSim/CUDAPCISPHSolver3Example.cu b/Examples/CUDASPHSim/CUDAPCISPHSolver3Example.cu
@@ -15,7 +15,11 @@
 #include <Core/PointGenerator/GridPointGenerator3.hpp>
 #include <Core/Utils/Serialization.hpp>
 
+#if defined(__HIP__)
+#include <Core/CUDA/cuda_to_hip.h>
+#else
 #include <cuda_runtime.h>
+#endif
 #include <pystring/pystring.h>
 
 #include <fstream>

diff --git a/Includes/Core/CUDA/CUDAAlgorithms.hpp b/Includes/Core/CUDA/CUDAAlgorithms.hpp
@@ -19,7 +19,7 @@
 
 namespace CubbyFlow
 {
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 
 template <typename T>
 __global__ void CUDAFillKernel(T* dst, size_t n, T val)

diff --git a/Includes/Core/CUDA/CUDAArray-Impl.hpp b/Includes/Core/CUDA/CUDAArray-Impl.hpp
@@ -15,7 +15,7 @@
 
 namespace CubbyFlow
 {
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 namespace Internal
 {
 template <typename T, size_t N, size_t I>
@@ -140,7 +140,7 @@ CUDAArray<T, N>::CUDAArray(const CUDAStdArray<size_t, N>& size,
     Base::SetPtrAndSize(m_data.data(), size);
 }
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 template <typename T, size_t N>
 template <typename... Args>
 CUDAArray<T, N>::CUDAArray(size_t nx, Args... args) : CUDAArray{}
@@ -354,7 +354,7 @@ void CUDAArray<T, N>::Fill(const T& val)
     m_data.Fill(val);
 }
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 template <typename T, size_t N>
 void CUDAArray<T, N>::Resize(CUDAStdArray<size_t, N> newSize, const T& initVal)
 {