diff --git a/Builds/CMake/CompileOptions.cmake b/Builds/CMake/CompileOptions.cmake
index 3ff767886c..c87b74814c 100644
--- a/Builds/CMake/CompileOptions.cmake
+++ b/Builds/CMake/CompileOptions.cmake
@@ -125,8 +125,10 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 endif()
 
 if (CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+	# Scope to the C/C++ host compiler: hipcc/clang rejects -Wno-class-memaccess
+	# as an unknown warning option under -Werror when it sees HIP source files.
 	set(DEFAULT_COMPILE_OPTIONS ${DEFAULT_COMPILE_OPTIONS}
-		-Wno-class-memaccess	# -> disable warning: error: 'void* memcpy(void*, const void*, size_t)' ... [-Werror=class-memaccess] (caused by imgui)
+		$<$<COMPILE_LANGUAGE:C,CXX>:-Wno-class-memaccess>	# -> disable warning: error: 'void* memcpy(void*, const void*, size_t)' ... [-Werror=class-memaccess] (caused by imgui)
 	)
 endif ()
 
@@ -138,6 +140,27 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 	)
 endif ()
 
+# Clang on Windows fires -Wnontrivial-memcall on memset/memcpy calls in the
+# bundled Flatbuffers-generated headers (pre-existing upstream code, not the
+# port). This warning is not emitted by GCC or MSVC, so suppress it on
+# Windows+Clang only.
+if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND WIN32)
+	set(DEFAULT_COMPILE_OPTIONS ${DEFAULT_COMPILE_OPTIONS}
+		-Wno-nontrivial-memcall
+	)
+endif ()
+
+# hipcc/clang flags warnings nvcc does not on the existing CUDA sources. These
+# must come AFTER -Werror (clang honors a later -Wno-* over an earlier -Werror),
+# so append them at the end and scope to the HIP language only.
+if (USE_HIP)
+	set(DEFAULT_COMPILE_OPTIONS ${DEFAULT_COMPILE_OPTIONS}
+		$<$<COMPILE_LANGUAGE:HIP>:-Wno-reorder-ctor>
+		$<$<COMPILE_LANGUAGE:HIP>:-Wno-unused-private-field>
+		$<$<COMPILE_LANGUAGE:HIP>:-Wno-unused-variable>
+	)
+endif ()
+
 #
 # Linker options
 #
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 44ff9fdeaf..8b48910af8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,9 +44,32 @@ elseif (CMAKE_BUILD_TYPE MATCHES "Debug")
 endif()
 message(STATUS "CMake build type: ${CMAKE_BUILD_TYPE}")
 
-# CUDA
+# GPU backend selection
+#   USE_CUDA -> NVIDIA CUDA (default)
+#   USE_HIP  -> AMD ROCm/HIP
+# The two are mutually exclusive; enabling HIP turns CUDA off.
 option(USE_CUDA "Use CUDA features" ON)
-set(CUDA_CRT_LINKAGE "dynamic" CACHE STRING 
+option(USE_HIP "Use AMD ROCm/HIP features" OFF)
+if (USE_HIP)
+    set(USE_CUDA OFF)
+    set(CUBBYFLOW_CUDA_TO_HIP_HEADER
+        ${CMAKE_CURRENT_SOURCE_DIR}/Includes/Core/CUDA/cuda_to_hip.h)
+    # enable_language(HIP) honors -DCMAKE_HIP_ARCHITECTURES, else auto-detects the
+    # host GPU, else errors on a no-GPU host.
+    enable_language(HIP)
+    # Force-include the compat shim on every HIP translation unit so the CUDA
+    # spelling resolves regardless of each file's own include order, and so the
+    # device-vs-host __CUDA_ARCH__ idiom is defined before any header uses it.
+    set(CMAKE_HIP_FLAGS
+        "${CMAKE_HIP_FLAGS} -include \"${CUBBYFLOW_CUDA_TO_HIP_HEADER}\"")
+    add_compile_definitions(CUBBYFLOW_USE_CUDA)
+    # The shim defines __CUDACC__ to keep the project's kernel/device guards
+    # visible, but that flips rocThrust's auto-detect to its CUDA backend (which
+    # then includes a CUDA-only CUB header). Pin Thrust to its HIP backend.
+    add_compile_definitions(THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP)
+    message(STATUS "Using HIP: arch ${CMAKE_HIP_ARCHITECTURES}")
+endif()
+set(CUDA_CRT_LINKAGE "dynamic" CACHE STRING
                      "CUDA targets' CRT options. For 'static', use /MT or /MTd. For 'dynamic', use /MD or /MDd")
 if (USE_CUDA)
     set(CUDA_LINK_LIBRARIES_KEYWORD PUBLIC)
@@ -156,7 +179,7 @@ option(BUILD_TESTS "Build the CubbyFlow test programs" ON)
 if (BUILD_TESTS)
     add_subdirectory(Tests/ManualTests)
     add_subdirectory(Tests/UnitTests)
-    if (USE_CUDA)
+    if (USE_CUDA OR USE_HIP)
         add_subdirectory(Tests/CUDATests)
     endif()
     if (NOT BUILD_SONARCLOUD)
@@ -167,7 +190,7 @@ endif()
 
 option(BUILD_EXAMPLES "Build the CubbyFlow example programs" ON)
 if (BUILD_EXAMPLES)
-    if (USE_CUDA)
+    if (USE_CUDA OR USE_HIP)
         add_subdirectory(Examples/CUDASPHSim)
     endif()
     if (NOT BUILD_SONARCLOUD)
@@ -182,7 +205,7 @@ if (BUILD_EXAMPLES)
     endif()
 endif()
 
-if (NOT USE_CUDA AND NOT BUILD_SONARCLOUD)
+if (NOT (USE_CUDA OR USE_HIP) AND NOT BUILD_SONARCLOUD)
     add_subdirectory(Libraries/pybind11)
     if (BUILD_FROM_PIP)
         set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${DEFAULT_CMAKE_LIBRARY_OUTPUT_DIRECTORY})
diff --git a/Documents/Install.md b/Documents/Install.md
index 8ae11d7cc0..d3bf6a9187 100644
--- a/Documents/Install.md
+++ b/Documents/Install.md
@@ -228,6 +228,21 @@ bin/UnitTests
 
 It should show all the tests are passing.
 
+### Building with GPU Acceleration
+
+CubbyFlow's CUDA SPH solvers can be built for either NVIDIA or AMD GPUs. The two backends are mutually exclusive and are selected at configure time. NVIDIA CUDA is enabled by default (`USE_CUDA=ON`), so the build instructions above already produce the CUDA solvers when a CUDA toolkit is found.
+
+To build the same solvers for AMD GPUs with ROCm/HIP instead, configure with `USE_HIP=ON` (which turns `USE_CUDA` off) and, if desired, set the target GPU architecture:
+
+```bash
+mkdir build
+cd build
+cmake .. -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx90a
+make
+```
+
+`CMAKE_HIP_ARCHITECTURES` accepts a semicolon-separated list of AMD GPU targets (for example `gfx90a` or `gfx1100`); when omitted it defaults to `gfx90a`. A working ROCm installation with HIP is required, and `hipcc` should be on the path so CMake can enable the HIP language.
+
 ### Running Tests
 
 There are two different tests in the codebase including the unit test and manual test. For the detailed instruction on how to run those tests, please checkout the documentation page from [the project website](https://utilforever.github.io/CubbyFlow/Documentation/).
diff --git a/Examples/CUDASPHSim/CMakeLists.txt b/Examples/CUDASPHSim/CMakeLists.txt
index 8c726a8967..1744a187e0 100644
--- a/Examples/CUDASPHSim/CMakeLists.txt
+++ b/Examples/CUDASPHSim/CMakeLists.txt
@@ -5,7 +5,7 @@ set(target CUDASPHSim)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 
 # Sources
-if (USE_CUDA)
+if (USE_CUDA OR USE_HIP)
     file(GLOB_RECURSE sources
         ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/*.cu)
@@ -19,6 +19,10 @@ if (USE_CUDA)
     cuda_add_executable(${target} ${sources}
         OPTIONS ${CUDA_TARGET_OPTIONS}
     )
+elseif (USE_HIP)
+    file(GLOB_RECURSE hip_sources ${CMAKE_CURRENT_SOURCE_DIR}/*.cu)
+    set_source_files_properties(${hip_sources} PROPERTIES LANGUAGE HIP)
+    add_executable(${target} ${sources})
 else()
     add_executable(${target} ${sources})
 endif()
diff --git a/Examples/CUDASPHSim/CUDAPCISPHSolver3Example.cu b/Examples/CUDASPHSim/CUDAPCISPHSolver3Example.cu
index 19bfc840d6..40186f39e6 100644
--- a/Examples/CUDASPHSim/CUDAPCISPHSolver3Example.cu
+++ b/Examples/CUDASPHSim/CUDAPCISPHSolver3Example.cu
@@ -15,7 +15,11 @@
 #include <Core/PointGenerator/GridPointGenerator3.hpp>
 #include <Core/Utils/Serialization.hpp>
 
+#if defined(__HIP__)
+#include <Core/CUDA/cuda_to_hip.h>
+#else
 #include <cuda_runtime.h>
+#endif
 #include <pystring/pystring.h>
 
 #include <fstream>
diff --git a/Includes/Core/CUDA/CUDAAlgorithms.hpp b/Includes/Core/CUDA/CUDAAlgorithms.hpp
index 9cb7bba295..36ad8caeb4 100644
--- a/Includes/Core/CUDA/CUDAAlgorithms.hpp
+++ b/Includes/Core/CUDA/CUDAAlgorithms.hpp
@@ -19,7 +19,7 @@
 
 namespace CubbyFlow
 {
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 
 template <typename T>
 __global__ void CUDAFillKernel(T* dst, size_t n, T val)
diff --git a/Includes/Core/CUDA/CUDAArray-Impl.hpp b/Includes/Core/CUDA/CUDAArray-Impl.hpp
index e9a9ead377..8eb7a3976e 100644
--- a/Includes/Core/CUDA/CUDAArray-Impl.hpp
+++ b/Includes/Core/CUDA/CUDAArray-Impl.hpp
@@ -15,7 +15,7 @@
 
 namespace CubbyFlow
 {
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 namespace Internal
 {
 template <typename T, size_t N, size_t I>
@@ -140,7 +140,7 @@ CUDAArray<T, N>::CUDAArray(const CUDAStdArray<size_t, N>& size,
     Base::SetPtrAndSize(m_data.data(), size);
 }
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 template <typename T, size_t N>
 template <typename... Args>
 CUDAArray<T, N>::CUDAArray(size_t nx, Args... args) : CUDAArray{}
@@ -354,7 +354,7 @@ void CUDAArray<T, N>::Fill(const T& val)
     m_data.Fill(val);
 }
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 template <typename T, size_t N>
 void CUDAArray<T, N>::Resize(CUDAStdArray<size_t, N> newSize, const T& initVal)
 {
diff --git a/Includes/Core/CUDA/CUDAArrayBase-Impl.hpp b/Includes/Core/CUDA/CUDAArrayBase-Impl.hpp
index 6248711dc0..b63c470c2e 100644
--- a/Includes/Core/CUDA/CUDAArrayBase-Impl.hpp
+++ b/Includes/Core/CUDA/CUDAArrayBase-Impl.hpp
@@ -16,14 +16,14 @@
 namespace CubbyFlow
 {
 template <typename T, size_t N, typename Derived>
-size_t CUDAArrayBase<T, N, Derived>::Index(size_t i) const
+CUBBYFLOW_CUDA_HOST_DEVICE size_t CUDAArrayBase<T, N, Derived>::Index(size_t i) const
 {
     return i;
 }
 
 template <typename T, size_t N, typename Derived>
 template <typename... Args>
-size_t CUDAArrayBase<T, N, Derived>::Index(size_t i, Args... args) const
+CUBBYFLOW_CUDA_HOST_DEVICE size_t CUDAArrayBase<T, N, Derived>::Index(size_t i, Args... args) const
 {
     static_assert(sizeof...(args) == N - 1, "Invalid number of indices.");
     return i + m_size[0] * IndexInternal(1, args...);
@@ -31,53 +31,53 @@ size_t CUDAArrayBase<T, N, Derived>::Index(size_t i, Args... args) const
 
 template <typename T, size_t N, typename Derived>
 template <size_t... I>
-size_t CUDAArrayBase<T, N, Derived>::Index(
+CUBBYFLOW_CUDA_HOST_DEVICE size_t CUDAArrayBase<T, N, Derived>::Index(
     const CUDAStdArray<size_t, N>& idx) const
 {
     return IndexInternal(idx, std::make_index_sequence<N>{});
 }
 
 template <typename T, size_t N, typename Derived>
-T* CUDAArrayBase<T, N, Derived>::data()
+CUBBYFLOW_CUDA_HOST_DEVICE T* CUDAArrayBase<T, N, Derived>::data()
 {
     return m_ptr;
 }
 
 template <typename T, size_t N, typename Derived>
-const T* CUDAArrayBase<T, N, Derived>::data() const
+CUBBYFLOW_CUDA_HOST_DEVICE const T* CUDAArrayBase<T, N, Derived>::data() const
 {
     return m_ptr;
 }
 
 template <typename T, size_t N, typename Derived>
-const CUDAStdArray<size_t, N>& CUDAArrayBase<T, N, Derived>::Size() const
+CUBBYFLOW_CUDA_HOST_DEVICE const CUDAStdArray<size_t, N>& CUDAArrayBase<T, N, Derived>::Size() const
 {
     return m_size;
 }
 
 template <typename T, size_t N, typename Derived>
 template <size_t M>
-std::enable_if_t<(M > 0), size_t> CUDAArrayBase<T, N, Derived>::Width() const
+CUBBYFLOW_CUDA_HOST_DEVICE std::enable_if_t<(M > 0), size_t> CUDAArrayBase<T, N, Derived>::Width() const
 {
     return m_size[0];
 }
 
 template <typename T, size_t N, typename Derived>
 template <size_t M>
-std::enable_if_t<(M > 1), size_t> CUDAArrayBase<T, N, Derived>::Height() const
+CUBBYFLOW_CUDA_HOST_DEVICE std::enable_if_t<(M > 1), size_t> CUDAArrayBase<T, N, Derived>::Height() const
 {
     return m_size[1];
 }
 
 template <typename T, size_t N, typename Derived>
 template <size_t M>
-std::enable_if_t<(M > 2), size_t> CUDAArrayBase<T, N, Derived>::Depth() const
+CUBBYFLOW_CUDA_HOST_DEVICE std::enable_if_t<(M > 2), size_t> CUDAArrayBase<T, N, Derived>::Depth() const
 {
     return m_size[2];
 }
 
 template <typename T, size_t N, typename Derived>
-size_t CUDAArrayBase<T, N, Derived>::Length() const
+CUBBYFLOW_CUDA_HOST_DEVICE size_t CUDAArrayBase<T, N, Derived>::Length() const
 {
     size_t l = m_size[0];
 
@@ -89,7 +89,10 @@ size_t CUDAArrayBase<T, N, Derived>::Length() const
     return l;
 }
 
-#ifdef __CUDA_ARCH__
+// See CUDAArrayBase.hpp: under HIP both spaces are defined and resolved by
+// attribute, so emit device definitions when compiling for HIP or in nvcc's
+// device pass, and host definitions when compiling for HIP or nvcc's host pass.
+#if defined(__HIP__) || defined(__CUDA_ARCH__)
 template <typename T, size_t N, typename Derived>
 CUBBYFLOW_CUDA_DEVICE typename CUDAArrayBase<T, N, Derived>::Reference
 CUDAArrayBase<T, N, Derived>::At(size_t i)
@@ -178,23 +181,24 @@ CUDAArrayBase<T, N, Derived>::operator()(
 {
     return At(idx);
 }
-#else
+#endif
+#if defined(__HIP__) || !defined(__CUDA_ARCH__)
 template <typename T, size_t N, typename Derived>
-typename CUDAArrayBase<T, N, Derived>::HostReference
+CUBBYFLOW_CUDA_HOST typename CUDAArrayBase<T, N, Derived>::HostReference
 CUDAArrayBase<T, N, Derived>::At(size_t i)
 {
     return HostReference(m_ptr + i);
 }
 
 template <typename T, size_t N, typename Derived>
-T CUDAArrayBase<T, N, Derived>::At(size_t i) const
+CUBBYFLOW_CUDA_HOST T CUDAArrayBase<T, N, Derived>::At(size_t i) const
 {
     return (T)HostReference(m_ptr + i);
 }
 
 template <typename T, size_t N, typename Derived>
 template <typename... Args>
-typename CUDAArrayBase<T, N, Derived>::HostReference
+CUBBYFLOW_CUDA_HOST typename CUDAArrayBase<T, N, Derived>::HostReference
 CUDAArrayBase<T, N, Derived>::At(size_t i, Args... args)
 {
     return At(Index(i, args...));
@@ -202,40 +206,40 @@ CUDAArrayBase<T, N, Derived>::At(size_t i, Args... args)
 
 template <typename T, size_t N, typename Derived>
 template <typename... Args>
-T CUDAArrayBase<T, N, Derived>::At(size_t i, Args... args) const
+CUBBYFLOW_CUDA_HOST T CUDAArrayBase<T, N, Derived>::At(size_t i, Args... args) const
 {
     return At(Index(i, args...));
 }
 
 template <typename T, size_t N, typename Derived>
-typename CUDAArrayBase<T, N, Derived>::HostReference
+CUBBYFLOW_CUDA_HOST typename CUDAArrayBase<T, N, Derived>::HostReference
 CUDAArrayBase<T, N, Derived>::At(const CUDAStdArray<size_t, N>& idx)
 {
     return At(Index(idx));
 }
 
 template <typename T, size_t N, typename Derived>
-T CUDAArrayBase<T, N, Derived>::At(const CUDAStdArray<size_t, N>& idx) const
+CUBBYFLOW_CUDA_HOST T CUDAArrayBase<T, N, Derived>::At(const CUDAStdArray<size_t, N>& idx) const
 {
     return At(Index(idx));
 }
 
 template <typename T, size_t N, typename Derived>
-typename CUDAArrayBase<T, N, Derived>::HostReference
+CUBBYFLOW_CUDA_HOST typename CUDAArrayBase<T, N, Derived>::HostReference
 CUDAArrayBase<T, N, Derived>::operator[](size_t i)
 {
     return At(i);
 }
 
 template <typename T, size_t N, typename Derived>
-T CUDAArrayBase<T, N, Derived>::operator[](size_t i) const
+CUBBYFLOW_CUDA_HOST T CUDAArrayBase<T, N, Derived>::operator[](size_t i) const
 {
     return At(i);
 }
 
 template <typename T, size_t N, typename Derived>
 template <typename... Args>
-typename CUDAArrayBase<T, N, Derived>::HostReference
+CUBBYFLOW_CUDA_HOST typename CUDAArrayBase<T, N, Derived>::HostReference
 CUDAArrayBase<T, N, Derived>::operator()(size_t i, Args... args)
 {
     return At(i, args...);
@@ -243,20 +247,20 @@ CUDAArrayBase<T, N, Derived>::operator()(size_t i, Args... args)
 
 template <typename T, size_t N, typename Derived>
 template <typename... Args>
-T CUDAArrayBase<T, N, Derived>::operator()(size_t i, Args... args) const
+CUBBYFLOW_CUDA_HOST T CUDAArrayBase<T, N, Derived>::operator()(size_t i, Args... args) const
 {
     return At(i, args...);
 }
 
 template <typename T, size_t N, typename Derived>
-typename CUDAArrayBase<T, N, Derived>::HostReference
+CUBBYFLOW_CUDA_HOST typename CUDAArrayBase<T, N, Derived>::HostReference
 CUDAArrayBase<T, N, Derived>::operator()(const CUDAStdArray<size_t, N>& idx)
 {
     return At(idx);
 }
 
 template <typename T, size_t N, typename Derived>
-T CUDAArrayBase<T, N, Derived>::operator()(
+CUBBYFLOW_CUDA_HOST T CUDAArrayBase<T, N, Derived>::operator()(
     const CUDAStdArray<size_t, N>& idx) const
 {
     return At(idx);
@@ -264,25 +268,25 @@ T CUDAArrayBase<T, N, Derived>::operator()(
 #endif
 
 template <typename T, size_t N, typename Derived>
-CUDAArrayBase<T, N, Derived>::CUDAArrayBase() : m_size{}
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAArrayBase<T, N, Derived>::CUDAArrayBase() : m_size{}
 {
     // Do nothing
 }
 
 template <typename T, size_t N, typename Derived>
-CUDAArrayBase<T, N, Derived>::CUDAArrayBase(const CUDAArrayBase& other)
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAArrayBase<T, N, Derived>::CUDAArrayBase(const CUDAArrayBase& other)
 {
     SetPtrAndSize(other.m_ptr, other.m_size);
 }
 
 template <typename T, size_t N, typename Derived>
-CUDAArrayBase<T, N, Derived>::CUDAArrayBase(CUDAArrayBase&& other) noexcept
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAArrayBase<T, N, Derived>::CUDAArrayBase(CUDAArrayBase&& other) noexcept
 {
     *this = std::move(other);
 }
 
 template <typename T, size_t N, typename Derived>
-CUDAArrayBase<T, N, Derived>& CUDAArrayBase<T, N, Derived>::operator=(
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAArrayBase<T, N, Derived>& CUDAArrayBase<T, N, Derived>::operator=(
     const CUDAArrayBase& other)
 {
     SetPtrAndSize(other.m_ptr, other.m_size);
@@ -290,7 +294,7 @@ CUDAArrayBase<T, N, Derived>& CUDAArrayBase<T, N, Derived>::operator=(
 }
 
 template <typename T, size_t N, typename Derived>
-CUDAArrayBase<T, N, Derived>& CUDAArrayBase<T, N, Derived>::operator=(
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAArrayBase<T, N, Derived>& CUDAArrayBase<T, N, Derived>::operator=(
     CUDAArrayBase&& other) noexcept
 {
     SetPtrAndSize(other.m_ptr, other.m_size);
@@ -300,14 +304,14 @@ CUDAArrayBase<T, N, Derived>& CUDAArrayBase<T, N, Derived>::operator=(
 
 template <typename T, size_t N, typename Derived>
 template <typename... Args>
-void CUDAArrayBase<T, N, Derived>::SetPtrAndSize(Pointer ptr, size_t ni,
+CUBBYFLOW_CUDA_HOST_DEVICE void CUDAArrayBase<T, N, Derived>::SetPtrAndSize(Pointer ptr, size_t ni,
                                                  Args... args)
 {
     SetPtrAndSize(ptr, CUDAStdArray<size_t, N>{ ni, args... });
 }
 
 template <typename T, size_t N, typename Derived>
-void CUDAArrayBase<T, N, Derived>::SetPtrAndSize(Pointer ptr,
+CUBBYFLOW_CUDA_HOST_DEVICE void CUDAArrayBase<T, N, Derived>::SetPtrAndSize(Pointer ptr,
                                                  CUDAStdArray<size_t, N> size)
 {
     m_ptr = ptr;
@@ -315,35 +319,35 @@ void CUDAArrayBase<T, N, Derived>::SetPtrAndSize(Pointer ptr,
 }
 
 template <typename T, size_t N, typename Derived>
-void CUDAArrayBase<T, N, Derived>::SwapPtrAndSize(CUDAArrayBase& other)
+CUBBYFLOW_CUDA_HOST_DEVICE void CUDAArrayBase<T, N, Derived>::SwapPtrAndSize(CUDAArrayBase& other)
 {
     CUDASwap(m_ptr, other.m_ptr);
     CUDASwap(m_size, other.m_size);
 }
 
 template <typename T, size_t N, typename Derived>
-void CUDAArrayBase<T, N, Derived>::ClearPtrAndSize()
+CUBBYFLOW_CUDA_HOST_DEVICE void CUDAArrayBase<T, N, Derived>::ClearPtrAndSize()
 {
     SetPtrAndSize(nullptr, CUDAStdArray<size_t, N>{});
 }
 
 template <typename T, size_t N, typename Derived>
 template <typename... Args>
-size_t CUDAArrayBase<T, N, Derived>::IndexInternal(size_t d, size_t i,
+CUBBYFLOW_CUDA_HOST_DEVICE size_t CUDAArrayBase<T, N, Derived>::IndexInternal(size_t d, size_t i,
                                                    Args... args) const
 {
     return i + m_size[d] * IndexInternal(d + 1, args...);
 }
 
 template <typename T, size_t N, typename Derived>
-size_t CUDAArrayBase<T, N, Derived>::IndexInternal(size_t, size_t i) const
+CUBBYFLOW_CUDA_HOST_DEVICE size_t CUDAArrayBase<T, N, Derived>::IndexInternal(size_t, size_t i) const
 {
     return i;
 }
 
 template <typename T, size_t N, typename Derived>
 template <size_t... I>
-size_t CUDAArrayBase<T, N, Derived>::IndexInternal(
+CUBBYFLOW_CUDA_HOST_DEVICE size_t CUDAArrayBase<T, N, Derived>::IndexInternal(
     const CUDAStdArray<size_t, N>& idx, std::index_sequence<I...>) const
 {
     return Index(idx[I]...);
@@ -352,4 +356,4 @@ size_t CUDAArrayBase<T, N, Derived>::IndexInternal(
 
 #endif
 
-#endif
\ No newline at end of file
+#endif
diff --git a/Includes/Core/CUDA/CUDAArrayBase.hpp b/Includes/Core/CUDA/CUDAArrayBase.hpp
index 650569ef5a..644a50110b 100644
--- a/Includes/Core/CUDA/CUDAArrayBase.hpp
+++ b/Includes/Core/CUDA/CUDAArrayBase.hpp
@@ -56,7 +56,63 @@ class CUDAArrayBase
 
     CUBBYFLOW_CUDA_HOST_DEVICE size_t Length() const;
 
-#ifdef __CUDA_ARCH__
+// Device accessors return live references into device memory; host accessors
+// return a copy-back wrapper (or a value). nvcc selects between them with
+// __CUDA_ARCH__ because it does not parse __host__ function bodies during the
+// device pass. clang (HIP) parses both spaces in both passes and resolves these
+// by the __host__/__device__ attributes instead, so under HIP both overload
+// sets must be declared at once and overload resolution picks by call context.
+#if defined(__HIP__)
+    CUBBYFLOW_CUDA_DEVICE Reference At(size_t i);
+    CUBBYFLOW_CUDA_HOST HostReference At(size_t i);
+
+    CUBBYFLOW_CUDA_DEVICE ConstReference At(size_t i) const;
+    CUBBYFLOW_CUDA_HOST ValueType At(size_t i) const;
+
+    template <typename... Args>
+    CUBBYFLOW_CUDA_DEVICE Reference At(size_t i, Args... args);
+    template <typename... Args>
+    CUBBYFLOW_CUDA_HOST HostReference At(size_t i, Args... args);
+
+    template <typename... Args>
+    CUBBYFLOW_CUDA_DEVICE ConstReference At(size_t i, Args... args) const;
+    template <typename... Args>
+    CUBBYFLOW_CUDA_HOST ValueType At(size_t i, Args... args) const;
+
+    CUBBYFLOW_CUDA_DEVICE Reference At(const CUDAStdArray<size_t, N>& idx);
+    CUBBYFLOW_CUDA_HOST HostReference At(const CUDAStdArray<size_t, N>& idx);
+
+    CUBBYFLOW_CUDA_DEVICE ConstReference
+    At(const CUDAStdArray<size_t, N>& idx) const;
+    CUBBYFLOW_CUDA_HOST ValueType At(const CUDAStdArray<size_t, N>& idx) const;
+
+    CUBBYFLOW_CUDA_DEVICE Reference operator[](size_t i);
+    CUBBYFLOW_CUDA_HOST HostReference operator[](size_t i);
+
+    CUBBYFLOW_CUDA_DEVICE ConstReference operator[](size_t i) const;
+    CUBBYFLOW_CUDA_HOST ValueType operator[](size_t i) const;
+
+    template <typename... Args>
+    CUBBYFLOW_CUDA_DEVICE Reference operator()(size_t i, Args... args);
+    template <typename... Args>
+    CUBBYFLOW_CUDA_HOST HostReference operator()(size_t i, Args... args);
+
+    template <typename... Args>
+    CUBBYFLOW_CUDA_DEVICE ConstReference operator()(size_t i,
+                                                    Args... args) const;
+    template <typename... Args>
+    CUBBYFLOW_CUDA_HOST ValueType operator()(size_t i, Args... args) const;
+
+    CUBBYFLOW_CUDA_DEVICE Reference
+    operator()(const CUDAStdArray<size_t, N>& idx);
+    CUBBYFLOW_CUDA_HOST HostReference
+    operator()(const CUDAStdArray<size_t, N>& idx);
+
+    CUBBYFLOW_CUDA_DEVICE ConstReference
+    operator()(const CUDAStdArray<size_t, N>& idx) const;
+    CUBBYFLOW_CUDA_HOST ValueType
+    operator()(const CUDAStdArray<size_t, N>& idx) const;
+#elif defined(__CUDA_ARCH__)
     CUBBYFLOW_CUDA_DEVICE Reference At(size_t i);
 
     CUBBYFLOW_CUDA_DEVICE ConstReference At(size_t i) const;
diff --git a/Includes/Core/CUDA/CUDAArrayView-Impl.hpp b/Includes/Core/CUDA/CUDAArrayView-Impl.hpp
index 94e92971c6..ae17f8a2a7 100644
--- a/Includes/Core/CUDA/CUDAArrayView-Impl.hpp
+++ b/Includes/Core/CUDA/CUDAArrayView-Impl.hpp
@@ -16,13 +16,13 @@
 namespace CubbyFlow
 {
 template <typename T, size_t N>
-CUDAArrayView<T, N>::CUDAArrayView() : Base()
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAArrayView<T, N>::CUDAArrayView() : Base()
 {
     // Do nothing
 }
 
 template <typename T, size_t N>
-CUDAArrayView<T, N>::CUDAArrayView(T* ptr, const CUDAStdArray<size_t, N>& size)
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAArrayView<T, N>::CUDAArrayView(T* ptr, const CUDAStdArray<size_t, N>& size)
     : CUDAArrayView{}
 {
     Base::SetPtrAndSize(ptr, size);
@@ -30,7 +30,7 @@ CUDAArrayView<T, N>::CUDAArrayView(T* ptr, const CUDAStdArray<size_t, N>& size)
 
 template <typename T, size_t N>
 template <size_t M>
-CUDAArrayView<T, N>::CUDAArrayView(
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAArrayView<T, N>::CUDAArrayView(
     typename std::enable_if<(M == 1), T>::type* ptr, size_t size)
     : CUDAArrayView(ptr, CUDAStdArray<size_t, N>{ size })
 {
@@ -38,33 +38,33 @@ CUDAArrayView<T, N>::CUDAArrayView(
 }
 
 template <typename T, size_t N>
-CUDAArrayView<T, N>::CUDAArrayView(CUDAArray<T, N>& other) : CUDAArrayView{}
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAArrayView<T, N>::CUDAArrayView(CUDAArray<T, N>& other) : CUDAArrayView{}
 {
     Set(other);
 }
 
 template <typename T, size_t N>
-CUDAArrayView<T, N>::CUDAArrayView(const CUDAArrayView& other)
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAArrayView<T, N>::CUDAArrayView(const CUDAArrayView& other)
 {
     Set(other);
 }
 
 template <typename T, size_t N>
-CUDAArrayView<T, N>::CUDAArrayView(CUDAArrayView&& other) noexcept
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAArrayView<T, N>::CUDAArrayView(CUDAArrayView&& other) noexcept
     : CUDAArrayView{}
 {
     *this = std::move(other);
 }
 
 template <typename T, size_t N>
-CUDAArrayView<T, N>& CUDAArrayView<T, N>::operator=(const CUDAArrayView& other)
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAArrayView<T, N>& CUDAArrayView<T, N>::operator=(const CUDAArrayView& other)
 {
     Set(other);
     return *this;
 }
 
 template <typename T, size_t N>
-CUDAArrayView<T, N>& CUDAArrayView<T, N>::operator=(
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAArrayView<T, N>& CUDAArrayView<T, N>::operator=(
     CUDAArrayView&& other) noexcept
 {
     Base::SetPtrAndSize(other.data(), other.Size());
@@ -73,31 +73,31 @@ CUDAArrayView<T, N>& CUDAArrayView<T, N>::operator=(
 }
 
 template <typename T, size_t N>
-void CUDAArrayView<T, N>::Set(CUDAArray<T, N>& other)
+CUBBYFLOW_CUDA_HOST_DEVICE void CUDAArrayView<T, N>::Set(CUDAArray<T, N>& other)
 {
     Base::SetPtrAndSize(other.data(), other.Size());
 }
 
 template <typename T, size_t N>
-void CUDAArrayView<T, N>::Set(const CUDAArrayView& other)
+CUBBYFLOW_CUDA_HOST_DEVICE void CUDAArrayView<T, N>::Set(const CUDAArrayView& other)
 {
     Base::SetPtrAndSize(const_cast<T*>(other.data()), other.Size());
 }
 
 template <typename T, size_t N>
-void CUDAArrayView<T, N>::Fill(const T& val)
+CUBBYFLOW_CUDA_HOST void CUDAArrayView<T, N>::Fill(const T& val)
 {
     CUDAFill(data(), val);
 }
 
 template <typename T, size_t N>
-CUDAArrayView<const T, N>::CUDAArrayView() : Base()
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAArrayView<const T, N>::CUDAArrayView() : Base()
 {
     // Do nothing
 }
 
 template <typename T, size_t N>
-CUDAArrayView<const T, N>::CUDAArrayView(const T* ptr,
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAArrayView<const T, N>::CUDAArrayView(const T* ptr,
                                          const CUDAStdArray<size_t, N>& size)
     : CUDAArrayView{}
 {
@@ -106,7 +106,7 @@ CUDAArrayView<const T, N>::CUDAArrayView(const T* ptr,
 
 template <typename T, size_t N>
 template <size_t M>
-CUDAArrayView<const T, N>::CUDAArrayView(
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAArrayView<const T, N>::CUDAArrayView(
     const typename std::enable_if<(M == 1), T>::type* ptr, size_t size)
     : CUDAArrayView(ptr, CUDAStdArray<size_t, N>{ size })
 {
@@ -114,33 +114,33 @@ CUDAArrayView<const T, N>::CUDAArrayView(
 }
 
 template <typename T, size_t N>
-CUDAArrayView<const T, N>::CUDAArrayView(const CUDAArray<T, N>& other)
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAArrayView<const T, N>::CUDAArrayView(const CUDAArray<T, N>& other)
     : CUDAArrayView{}
 {
     Set(other);
 }
 
 template <typename T, size_t N>
-CUDAArrayView<const T, N>::CUDAArrayView(const CUDAArrayView<T, N>& other)
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAArrayView<const T, N>::CUDAArrayView(const CUDAArrayView<T, N>& other)
 {
     Set(other);
 }
 
 template <typename T, size_t N>
-CUDAArrayView<const T, N>::CUDAArrayView(const CUDAArrayView& other)
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAArrayView<const T, N>::CUDAArrayView(const CUDAArrayView& other)
 {
     Set(other);
 }
 
 template <typename T, size_t N>
-CUDAArrayView<const T, N>::CUDAArrayView(CUDAArrayView&& other) noexcept
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAArrayView<const T, N>::CUDAArrayView(CUDAArrayView&& other) noexcept
     : CUDAArrayView{}
 {
     *this = std::move(other);
 }
 
 template <typename T, size_t N>
-CUDAArrayView<const T, N>& CUDAArrayView<const T, N>::operator=(
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAArrayView<const T, N>& CUDAArrayView<const T, N>::operator=(
     const CUDAArrayView<T, N>& other)
 {
     Set(other);
@@ -148,7 +148,7 @@ CUDAArrayView<const T, N>& CUDAArrayView<const T, N>::operator=(
 }
 
 template <typename T, size_t N>
-CUDAArrayView<const T, N>& CUDAArrayView<const T, N>::operator=(
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAArrayView<const T, N>& CUDAArrayView<const T, N>::operator=(
     const CUDAArrayView& other)
 {
     Set(other);
@@ -156,7 +156,7 @@ CUDAArrayView<const T, N>& CUDAArrayView<const T, N>::operator=(
 }
 
 template <typename T, size_t N>
-CUDAArrayView<const T, N>& CUDAArrayView<const T, N>::operator=(
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAArrayView<const T, N>& CUDAArrayView<const T, N>::operator=(
     CUDAArrayView&& other) noexcept
 {
     Base::SetPtrAndSize(other.data(), other.Size());
@@ -165,19 +165,19 @@ CUDAArrayView<const T, N>& CUDAArrayView<const T, N>::operator=(
 }
 
 template <typename T, size_t N>
-void CUDAArrayView<const T, N>::Set(const CUDAArray<T, N>& other)
+CUBBYFLOW_CUDA_HOST_DEVICE void CUDAArrayView<const T, N>::Set(const CUDAArray<T, N>& other)
 {
     Base::SetPtrAndSize(other.data(), other.Size());
 }
 
 template <typename T, size_t N>
-void CUDAArrayView<const T, N>::Set(const CUDAArrayView<T, N>& other)
+CUBBYFLOW_CUDA_HOST_DEVICE void CUDAArrayView<const T, N>::Set(const CUDAArrayView<T, N>& other)
 {
     Base::SetPtrAndSize(other.data(), other.Size());
 }
 
 template <typename T, size_t N>
-void CUDAArrayView<const T, N>::Set(const CUDAArrayView& other)
+CUBBYFLOW_CUDA_HOST_DEVICE void CUDAArrayView<const T, N>::Set(const CUDAArrayView& other)
 {
     Base::SetPtrAndSize(other.data(), other.Size());
 }
@@ -185,4 +185,4 @@ void CUDAArrayView<const T, N>::Set(const CUDAArrayView& other)
 
 #endif
 
-#endif
\ No newline at end of file
+#endif
diff --git a/Includes/Core/CUDA/CUDAParticleSystemData2.hpp b/Includes/Core/CUDA/CUDAParticleSystemData2.hpp
index f81cae2dff..de5cf2157d 100644
--- a/Includes/Core/CUDA/CUDAParticleSystemData2.hpp
+++ b/Includes/Core/CUDA/CUDAParticleSystemData2.hpp
@@ -18,7 +18,11 @@
 #include <Core/CUDA/CUDAPointHashGridSearcher2.hpp>
 #include <Core/Matrix/Matrix.hpp>
 
+#if defined(__HIP__)
+#include <Core/CUDA/cuda_to_hip.h>
+#else
 #include <cuda_runtime.h>
+#endif
 
 namespace CubbyFlow
 {
diff --git a/Includes/Core/CUDA/CUDAParticleSystemData3.hpp b/Includes/Core/CUDA/CUDAParticleSystemData3.hpp
index 06c7d3c3fe..b6f951a0ac 100644
--- a/Includes/Core/CUDA/CUDAParticleSystemData3.hpp
+++ b/Includes/Core/CUDA/CUDAParticleSystemData3.hpp
@@ -18,7 +18,11 @@
 #include <Core/CUDA/CUDAPointHashGridSearcher3.hpp>
 #include <Core/Matrix/Matrix.hpp>
 
+#if defined(__HIP__)
+#include <Core/CUDA/cuda_to_hip.h>
+#else
 #include <cuda_runtime.h>
+#endif
 
 namespace CubbyFlow
 {
diff --git a/Includes/Core/CUDA/CUDAPointHashGridSearcher2.hpp b/Includes/Core/CUDA/CUDAPointHashGridSearcher2.hpp
index 5920a5d94c..bae82ae120 100644
--- a/Includes/Core/CUDA/CUDAPointHashGridSearcher2.hpp
+++ b/Includes/Core/CUDA/CUDAPointHashGridSearcher2.hpp
@@ -16,7 +16,11 @@
 #include <Core/CUDA/CUDAArrayView.hpp>
 #include <Core/Matrix/Matrix.hpp>
 
+#if defined(__HIP__)
+#include <Core/CUDA/cuda_to_hip.h>
+#else
 #include <cuda_runtime.h>
+#endif
 
 namespace CubbyFlow
 {
diff --git a/Includes/Core/CUDA/CUDAPointHashGridSearcher3.hpp b/Includes/Core/CUDA/CUDAPointHashGridSearcher3.hpp
index 6d631fe821..1f7f5be4fe 100644
--- a/Includes/Core/CUDA/CUDAPointHashGridSearcher3.hpp
+++ b/Includes/Core/CUDA/CUDAPointHashGridSearcher3.hpp
@@ -16,7 +16,11 @@
 #include <Core/CUDA/CUDAArrayView.hpp>
 #include <Core/Matrix/Matrix.hpp>
 
+#if defined(__HIP__)
+#include <Core/CUDA/cuda_to_hip.h>
+#else
 #include <cuda_runtime.h>
+#endif
 
 namespace CubbyFlow
 {
diff --git a/Includes/Core/CUDA/CUDASPHKernels2-Impl.hpp b/Includes/Core/CUDA/CUDASPHKernels2-Impl.hpp
index c6e1f4d65f..d2c84c478b 100644
--- a/Includes/Core/CUDA/CUDASPHKernels2-Impl.hpp
+++ b/Includes/Core/CUDA/CUDASPHKernels2-Impl.hpp
@@ -18,18 +18,18 @@
 
 namespace CubbyFlow
 {
-inline CUDASPHStdKernel2::CUDASPHStdKernel2() : h(0), h2(0), h3(0), h4(0)
+inline CUBBYFLOW_CUDA_HOST_DEVICE CUDASPHStdKernel2::CUDASPHStdKernel2() : h(0), h2(0), h3(0), h4(0)
 {
     // Do nothing
 }
 
-inline CUDASPHStdKernel2::CUDASPHStdKernel2(float kernelRadius)
+inline CUBBYFLOW_CUDA_HOST_DEVICE CUDASPHStdKernel2::CUDASPHStdKernel2(float kernelRadius)
     : h(kernelRadius), h2(h * h), h3(h2 * h), h4(h2 * h2)
 {
     // Do nothing
 }
 
-inline float CUDASPHStdKernel2::operator()(float distance) const
+inline CUBBYFLOW_CUDA_HOST_DEVICE float CUDASPHStdKernel2::operator()(float distance) const
 {
     const float distanceSquared = distance * distance;
 
@@ -42,7 +42,7 @@ inline float CUDASPHStdKernel2::operator()(float distance) const
     return 4.0f / (PI_FLOAT * h2) * x * x * x;
 }
 
-inline float CUDASPHStdKernel2::FirstDerivative(float distance) const
+inline CUBBYFLOW_CUDA_HOST_DEVICE float CUDASPHStdKernel2::FirstDerivative(float distance) const
 {
     if (distance >= h)
     {
@@ -53,7 +53,7 @@ inline float CUDASPHStdKernel2::FirstDerivative(float distance) const
     return -24.0f * distance / (PI_FLOAT * h4) * x * x;
 }
 
-inline float CUDASPHStdKernel2::SecondDerivative(float distance) const
+inline CUBBYFLOW_CUDA_HOST_DEVICE float CUDASPHStdKernel2::SecondDerivative(float distance) const
 {
     float distanceSquared = distance * distance;
 
@@ -66,7 +66,7 @@ inline float CUDASPHStdKernel2::SecondDerivative(float distance) const
     return 24.0f / (PI_FLOAT * h4) * (1 - x) * (5 * x - 1);
 }
 
-inline float2 CUDASPHStdKernel2::Gradient(const float2& point) const
+inline CUBBYFLOW_CUDA_HOST_DEVICE float2 CUDASPHStdKernel2::Gradient(const float2& point) const
 {
     float dist = Length(point);
 
@@ -78,25 +78,25 @@ inline float2 CUDASPHStdKernel2::Gradient(const float2& point) const
     return make_float2(0, 0);
 }
 
-inline float2 CUDASPHStdKernel2::Gradient(float distance,
+inline CUBBYFLOW_CUDA_HOST_DEVICE float2 CUDASPHStdKernel2::Gradient(float distance,
                                           const float2& directionToCenter) const
 {
     return -FirstDerivative(distance) * directionToCenter;
 }
 
-inline CUDASPHSpikyKernel2::CUDASPHSpikyKernel2()
+inline CUBBYFLOW_CUDA_HOST_DEVICE CUDASPHSpikyKernel2::CUDASPHSpikyKernel2()
     : h(0), h2(0), h3(0), h4(0), h5(0)
 {
     // Do nothing
 }
 
-inline CUDASPHSpikyKernel2::CUDASPHSpikyKernel2(float kernelRadius)
+inline CUBBYFLOW_CUDA_HOST_DEVICE CUDASPHSpikyKernel2::CUDASPHSpikyKernel2(float kernelRadius)
     : h(kernelRadius), h2(h * h), h3(h2 * h), h4(h2 * h2), h5(h3 * h2)
 {
     // Do nothing
 }
 
-inline float CUDASPHSpikyKernel2::operator()(float distance) const
+inline CUBBYFLOW_CUDA_HOST_DEVICE float CUDASPHSpikyKernel2::operator()(float distance) const
 {
     if (distance >= h)
     {
@@ -107,7 +107,7 @@ inline float CUDASPHSpikyKernel2::operator()(float distance) const
     return 10.0f / (PI_FLOAT * h2) * x * x * x;
 }
 
-inline float CUDASPHSpikyKernel2::FirstDerivative(float distance) const
+inline CUBBYFLOW_CUDA_HOST_DEVICE float CUDASPHSpikyKernel2::FirstDerivative(float distance) const
 {
     if (distance >= h)
     {
@@ -118,7 +118,7 @@ inline float CUDASPHSpikyKernel2::FirstDerivative(float distance) const
     return -30.0f / (PI_FLOAT * h3) * x * x;
 }
 
-inline float CUDASPHSpikyKernel2::SecondDerivative(float distance) const
+inline CUBBYFLOW_CUDA_HOST_DEVICE float CUDASPHSpikyKernel2::SecondDerivative(float distance) const
 {
     if (distance >= h)
     {
@@ -129,7 +129,7 @@ inline float CUDASPHSpikyKernel2::SecondDerivative(float distance) const
     return 60.0f / (PI_FLOAT * h4) * x;
 }
 
-inline float2 CUDASPHSpikyKernel2::Gradient(const float2& point) const
+inline CUBBYFLOW_CUDA_HOST_DEVICE float2 CUDASPHSpikyKernel2::Gradient(const float2& point) const
 {
     float dist = Length(point);
 
@@ -141,7 +141,7 @@ inline float2 CUDASPHSpikyKernel2::Gradient(const float2& point) const
     return make_float2(0, 0);
 }
 
-inline float2 CUDASPHSpikyKernel2::Gradient(
+inline CUBBYFLOW_CUDA_HOST_DEVICE float2 CUDASPHSpikyKernel2::Gradient(
     float distance, const float2& directionToCenter) const
 {
     return -FirstDerivative(distance) * directionToCenter;
diff --git a/Includes/Core/CUDA/CUDASPHKernels3-Impl.hpp b/Includes/Core/CUDA/CUDASPHKernels3-Impl.hpp
index 60f4412908..09837543d3 100644
--- a/Includes/Core/CUDA/CUDASPHKernels3-Impl.hpp
+++ b/Includes/Core/CUDA/CUDASPHKernels3-Impl.hpp
@@ -18,18 +18,18 @@
 
 namespace CubbyFlow
 {
-inline CUDASPHStdKernel3::CUDASPHStdKernel3() : h(0), h2(0), h3(0), h5(0)
+inline CUBBYFLOW_CUDA_HOST_DEVICE CUDASPHStdKernel3::CUDASPHStdKernel3() : h(0), h2(0), h3(0), h5(0)
 {
     // Do nothing
 }
 
-inline CUDASPHStdKernel3::CUDASPHStdKernel3(float kernelRadius)
+inline CUBBYFLOW_CUDA_HOST_DEVICE CUDASPHStdKernel3::CUDASPHStdKernel3(float kernelRadius)
     : h(kernelRadius), h2(h * h), h3(h2 * h), h5(h2 * h3)
 {
     // Do nothing
 }
 
-inline float CUDASPHStdKernel3::operator()(float distance) const
+inline CUBBYFLOW_CUDA_HOST_DEVICE float CUDASPHStdKernel3::operator()(float distance) const
 {
     if (distance * distance >= h2)
     {
@@ -40,7 +40,7 @@ inline float CUDASPHStdKernel3::operator()(float distance) const
     return 315.0f / (64.0f * PI_FLOAT * h3) * x * x * x;
 }
 
-inline float CUDASPHStdKernel3::FirstDerivative(float distance) const
+inline CUBBYFLOW_CUDA_HOST_DEVICE float CUDASPHStdKernel3::FirstDerivative(float distance) const
 {
     if (distance >= h)
     {
@@ -51,7 +51,7 @@ inline float CUDASPHStdKernel3::FirstDerivative(float distance) const
     return -945.0f / (32.0f * PI_FLOAT * h5) * distance * x * x;
 }
 
-inline float CUDASPHStdKernel3::SecondDerivative(float distance) const
+inline CUBBYFLOW_CUDA_HOST_DEVICE float CUDASPHStdKernel3::SecondDerivative(float distance) const
 {
     if (distance * distance >= h2)
     {
@@ -62,7 +62,7 @@ inline float CUDASPHStdKernel3::SecondDerivative(float distance) const
     return 945.0f / (32.0f * PI_FLOAT * h5) * (1 - x) * (3 * x - 1);
 }
 
-inline float4 CUDASPHStdKernel3::Gradient(const float4& point) const
+inline CUBBYFLOW_CUDA_HOST_DEVICE float4 CUDASPHStdKernel3::Gradient(const float4& point) const
 {
     float dist = Length(point);
 
@@ -74,25 +74,25 @@ inline float4 CUDASPHStdKernel3::Gradient(const float4& point) const
     return make_float4(0, 0, 0, 0);
 }
 
-inline float4 CUDASPHStdKernel3::Gradient(float distance,
+inline CUBBYFLOW_CUDA_HOST_DEVICE float4 CUDASPHStdKernel3::Gradient(float distance,
                                           const float4& directionToCenter) const
 {
     return -FirstDerivative(distance) * directionToCenter;
 }
 
-inline CUDASPHSpikyKernel3::CUDASPHSpikyKernel3()
+inline CUBBYFLOW_CUDA_HOST_DEVICE CUDASPHSpikyKernel3::CUDASPHSpikyKernel3()
     : h(0), h2(0), h3(0), h4(0), h5(0)
 {
     // Do nothing
 }
 
-inline CUDASPHSpikyKernel3::CUDASPHSpikyKernel3(float kernelRadius)
+inline CUBBYFLOW_CUDA_HOST_DEVICE CUDASPHSpikyKernel3::CUDASPHSpikyKernel3(float kernelRadius)
     : h(kernelRadius), h2(h * h), h3(h2 * h), h4(h2 * h2), h5(h3 * h2)
 {
     // Do nothing
 }
 
-inline float CUDASPHSpikyKernel3::operator()(float distance) const
+inline CUBBYFLOW_CUDA_HOST_DEVICE float CUDASPHSpikyKernel3::operator()(float distance) const
 {
     if (distance >= h)
     {
@@ -103,7 +103,7 @@ inline float CUDASPHSpikyKernel3::operator()(float distance) const
     return 15.0f / (PI_FLOAT * h3) * x * x * x;
 }
 
-inline float CUDASPHSpikyKernel3::FirstDerivative(float distance) const
+inline CUBBYFLOW_CUDA_HOST_DEVICE float CUDASPHSpikyKernel3::FirstDerivative(float distance) const
 {
     if (distance >= h)
     {
@@ -114,7 +114,7 @@ inline float CUDASPHSpikyKernel3::FirstDerivative(float distance) const
     return -45.0f / (PI_FLOAT * h4) * x * x;
 }
 
-inline float CUDASPHSpikyKernel3::SecondDerivative(float distance) const
+inline CUBBYFLOW_CUDA_HOST_DEVICE float CUDASPHSpikyKernel3::SecondDerivative(float distance) const
 {
     if (distance >= h)
     {
@@ -125,7 +125,7 @@ inline float CUDASPHSpikyKernel3::SecondDerivative(float distance) const
     return 90.0f / (PI_FLOAT * h5) * x;
 }
 
-inline float4 CUDASPHSpikyKernel3::Gradient(const float4& point) const
+inline CUBBYFLOW_CUDA_HOST_DEVICE float4 CUDASPHSpikyKernel3::Gradient(const float4& point) const
 {
     float dist = Length(point);
 
@@ -137,7 +137,7 @@ inline float4 CUDASPHSpikyKernel3::Gradient(const float4& point) const
     return make_float4(0, 0, 0, 0);
 }
 
-inline float4 CUDASPHSpikyKernel3::Gradient(
+inline CUBBYFLOW_CUDA_HOST_DEVICE float4 CUDASPHSpikyKernel3::Gradient(
     float distance, const float4& directionToCenter) const
 {
     return -FirstDerivative(distance) * directionToCenter;
diff --git a/Includes/Core/CUDA/CUDAStdArray-Impl.hpp b/Includes/Core/CUDA/CUDAStdArray-Impl.hpp
index 766ef3a32b..1472508d64 100644
--- a/Includes/Core/CUDA/CUDAStdArray-Impl.hpp
+++ b/Includes/Core/CUDA/CUDAStdArray-Impl.hpp
@@ -16,14 +16,15 @@
 namespace CubbyFlow
 {
 template <typename T, size_t N>
-CUDAStdArray<T, N>::CUDAStdArray()
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAStdArray<T, N>::CUDAStdArray()
 {
     Fill(T{});
 }
 
 template <typename T, size_t N>
 template <typename... Args>
-CUDAStdArray<T, N>::CUDAStdArray(ConstReference first, Args... rest)
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAStdArray<T, N>::CUDAStdArray(ConstReference first,
+                                                            Args... rest)
 {
     static_assert(
         sizeof...(Args) == N - 1,
@@ -32,7 +33,7 @@ CUDAStdArray<T, N>::CUDAStdArray(ConstReference first, Args... rest)
 }
 
 template <typename T, size_t N>
-CUDAStdArray<T, N>::CUDAStdArray(const std::array<T, N>& other)
+CUBBYFLOW_CUDA_HOST CUDAStdArray<T, N>::CUDAStdArray(const std::array<T, N>& other)
 {
     for (size_t i = 0; i < N; ++i)
     {
@@ -41,7 +42,7 @@ CUDAStdArray<T, N>::CUDAStdArray(const std::array<T, N>& other)
 }
 
 template <typename T, size_t N>
-CUDAStdArray<T, N>::CUDAStdArray(const Vector<T, N>& other)
+CUBBYFLOW_CUDA_HOST CUDAStdArray<T, N>::CUDAStdArray(const Vector<T, N>& other)
 {
     for (size_t i = 0; i < N; ++i)
     {
@@ -50,7 +51,8 @@ CUDAStdArray<T, N>::CUDAStdArray(const Vector<T, N>& other)
 }
 
 template <typename T, size_t N>
-CUDAStdArray<T, N>::CUDAStdArray(const CUDAStdArray& other)
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAStdArray<T, N>::CUDAStdArray(
+    const CUDAStdArray& other)
 {
     for (size_t i = 0; i < N; ++i)
     {
@@ -59,7 +61,8 @@ CUDAStdArray<T, N>::CUDAStdArray(const CUDAStdArray& other)
 }
 
 template <typename T, size_t N>
-CUDAStdArray<T, N>::CUDAStdArray(CUDAStdArray&& other) noexcept
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAStdArray<T, N>::CUDAStdArray(
+    CUDAStdArray&& other) noexcept
 {
     for (size_t i = 0; i < N; ++i)
     {
@@ -68,7 +71,8 @@ CUDAStdArray<T, N>::CUDAStdArray(CUDAStdArray&& other) noexcept
 }
 
 template <typename T, size_t N>
-CUDAStdArray<T, N>& CUDAStdArray<T, N>::operator=(const CUDAStdArray& other)
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAStdArray<T, N>& CUDAStdArray<T, N>::operator=(
+    const CUDAStdArray& other)
 {
     for (size_t i = 0; i < N; ++i)
     {
@@ -79,7 +83,8 @@ CUDAStdArray<T, N>& CUDAStdArray<T, N>::operator=(const CUDAStdArray& other)
 }
 
 template <typename T, size_t N>
-CUDAStdArray<T, N>& CUDAStdArray<T, N>::operator=(CUDAStdArray&& other) noexcept
+CUBBYFLOW_CUDA_HOST_DEVICE CUDAStdArray<T, N>& CUDAStdArray<T, N>::operator=(
+    CUDAStdArray&& other) noexcept
 {
     for (size_t i = 0; i < N; ++i)
     {
@@ -90,7 +95,7 @@ CUDAStdArray<T, N>& CUDAStdArray<T, N>::operator=(CUDAStdArray&& other) noexcept
 }
 
 template <typename T, size_t N>
-void CUDAStdArray<T, N>::Fill(ConstReference val)
+CUBBYFLOW_CUDA_HOST_DEVICE void CUDAStdArray<T, N>::Fill(ConstReference val)
 {
     for (size_t i = 0; i < N; ++i)
     {
@@ -112,20 +117,22 @@ CUBBYFLOW_CUDA_HOST Vector<T, N> CUDAStdArray<T, N>::ToVector() const
 }
 
 template <typename T, size_t N>
-typename CUDAStdArray<T, N>::Reference CUDAStdArray<T, N>::operator[](size_t i)
+CUBBYFLOW_CUDA_HOST_DEVICE typename CUDAStdArray<T, N>::Reference
+CUDAStdArray<T, N>::operator[](size_t i)
 {
     return m_elements[i];
 }
 
 template <typename T, size_t N>
-typename CUDAStdArray<T, N>::ConstReference CUDAStdArray<T, N>::operator[](
-    size_t i) const
+CUBBYFLOW_CUDA_HOST_DEVICE typename CUDAStdArray<T, N>::ConstReference
+CUDAStdArray<T, N>::operator[](size_t i) const
 {
     return m_elements[i];
 }
 
 template <typename T, size_t N>
-bool CUDAStdArray<T, N>::operator==(const CUDAStdArray& other) const
+CUBBYFLOW_CUDA_HOST_DEVICE bool CUDAStdArray<T, N>::operator==(
+    const CUDAStdArray& other) const
 {
     for (size_t i = 0; i < N; ++i)
     {
@@ -139,14 +146,17 @@ bool CUDAStdArray<T, N>::operator==(const CUDAStdArray& other) const
 }
 
 template <typename T, size_t N>
-bool CUDAStdArray<T, N>::operator!=(const CUDAStdArray& other) const
+CUBBYFLOW_CUDA_HOST_DEVICE bool CUDAStdArray<T, N>::operator!=(
+    const CUDAStdArray& other) const
 {
     return !(*this == other);
 }
 
 template <typename T, size_t N>
 template <typename... Args>
-void CUDAStdArray<T, N>::SetAt(size_t i, ConstReference first, Args... rest)
+CUBBYFLOW_CUDA_HOST_DEVICE void CUDAStdArray<T, N>::SetAt(size_t i,
+                                                         ConstReference first,
+                                                         Args... rest)
 {
     m_elements[i] = first;
     SetAt(i + 1, rest...);
@@ -154,7 +164,8 @@ void CUDAStdArray<T, N>::SetAt(size_t i, ConstReference first, Args... rest)
 
 template <typename T, size_t N>
 template <typename... Args>
-void CUDAStdArray<T, N>::SetAt(size_t i, ConstReference first)
+CUBBYFLOW_CUDA_HOST_DEVICE void CUDAStdArray<T, N>::SetAt(size_t i,
+                                                         ConstReference first)
 {
     m_elements[i] = first;
 }
diff --git a/Includes/Core/CUDA/CUDAStdVector-Impl.hpp b/Includes/Core/CUDA/CUDAStdVector-Impl.hpp
index eb14862fd8..4d47a26f3b 100644
--- a/Includes/Core/CUDA/CUDAStdVector-Impl.hpp
+++ b/Includes/Core/CUDA/CUDAStdVector-Impl.hpp
@@ -95,7 +95,7 @@ size_t CUDAStdVector<T>::Size() const
     return m_size;
 }
 
-#ifdef __CUDA_ARCH__
+#if defined(__HIP__) || defined(__CUDA_ARCH__)
 template <typename T>
 __device__ typename CUDAStdVector<T>::Reference CUDAStdVector<T>::At(size_t i)
 {
@@ -108,16 +108,18 @@ __device__ typename CUDAStdVector<T>::ConstReference CUDAStdVector<T>::At(
 {
     return m_ptr[i];
 }
-#else
+#endif
+#if defined(__HIP__) || !defined(__CUDA_ARCH__)
 template <typename T>
-typename CUDAStdVector<T>::ReferenceType CUDAStdVector<T>::At(size_t i)
+CUBBYFLOW_CUDA_HOST typename CUDAStdVector<T>::ReferenceType
+CUDAStdVector<T>::At(size_t i)
 {
     ReferenceType r(m_ptr + i);
     return r;
 }
 
 template <typename T>
-T CUDAStdVector<T>::At(size_t i) const
+CUBBYFLOW_CUDA_HOST T CUDAStdVector<T>::At(size_t i) const
 {
     T tmp;
     CUDACopyDeviceToHost(m_ptr + i, 1, &tmp);
@@ -233,28 +235,31 @@ void CUDAStdVector<T>::CopyTo(std::vector<T, A>& other)
     CUDACopyDeviceToHost(m_ptr, m_size, other.data());
 }
 
-#ifdef __CUDA_ARCH__
+#if defined(__HIP__) || defined(__CUDA_ARCH__)
 template <typename T>
-typename CUDAStdVector<T>::Reference CUDAStdVector<T>::operator[](size_t i)
+__device__ typename CUDAStdVector<T>::Reference CUDAStdVector<T>::operator[](
+    size_t i)
 {
     return At(i);
 }
 
 template <typename T>
-typename CUDAStdVector<T>::ConstReference CUDAStdVector<T>::operator[](
-    size_t i) const
+__device__ typename CUDAStdVector<T>::ConstReference
+CUDAStdVector<T>::operator[](size_t i) const
 {
     return At(i);
 }
-#else
+#endif
+#if defined(__HIP__) || !defined(__CUDA_ARCH__)
 template <typename T>
-typename CUDAStdVector<T>::ReferenceType CUDAStdVector<T>::operator[](size_t i)
+CUBBYFLOW_CUDA_HOST typename CUDAStdVector<T>::ReferenceType
+CUDAStdVector<T>::operator[](size_t i)
 {
     return At(i);
 }
 
 template <typename T>
-T CUDAStdVector<T>::operator[](size_t i) const
+CUBBYFLOW_CUDA_HOST T CUDAStdVector<T>::operator[](size_t i) const
 {
     return At(i);
 }
diff --git a/Includes/Core/CUDA/CUDAStdVector.hpp b/Includes/Core/CUDA/CUDAStdVector.hpp
index ac06d699e8..00a40a9303 100644
--- a/Includes/Core/CUDA/CUDAStdVector.hpp
+++ b/Includes/Core/CUDA/CUDAStdVector.hpp
@@ -65,7 +65,7 @@ class CUDAStdVector final
             return *this;
         }
 
-#ifdef __CUDA_ARCH__
+#if defined(__HIP__) || defined(__CUDA_ARCH__)
         __device__ ReferenceType& operator=(const ValueType& val)
         {
             *m_ptr = val;
@@ -76,7 +76,8 @@ class CUDAStdVector final
         {
             return *m_ptr;
         }
-#else
+#endif
+#if defined(__HIP__) || !defined(__CUDA_ARCH__)
         CUBBYFLOW_CUDA_HOST ReferenceType& operator=(const ValueType& val)
         {
             CUDACopyHostToDevice(&val, 1, m_ptr);
@@ -121,11 +122,12 @@ class CUDAStdVector final
 
     size_t Size() const;
 
-#ifdef __CUDA_ARCH__
+#if defined(__HIP__) || defined(__CUDA_ARCH__)
     __device__ Reference At(size_t i);
 
     __device__ ConstReference At(size_t i) const;
-#else
+#endif
+#if defined(__HIP__) || !defined(__CUDA_ARCH__)
     CUBBYFLOW_CUDA_HOST ReferenceType At(size_t i);
 
     CUBBYFLOW_CUDA_HOST T At(size_t i) const;
@@ -155,14 +157,15 @@ class CUDAStdVector final
     template <typename A>
     void CopyTo(std::vector<T, A>& other);
 
-#ifdef __CUDA_ARCH__
-    Reference operator[](size_t i);
+#if defined(__HIP__) || defined(__CUDA_ARCH__)
+    __device__ Reference operator[](size_t i);
 
-    ConstReference operator[](size_t i) const;
-#else
-    ReferenceType operator[](size_t i);
+    __device__ ConstReference operator[](size_t i) const;
+#endif
+#if defined(__HIP__) || !defined(__CUDA_ARCH__)
+    CUBBYFLOW_CUDA_HOST ReferenceType operator[](size_t i);
 
-    T operator[](size_t i) const;
+    CUBBYFLOW_CUDA_HOST T operator[](size_t i) const;
 #endif
 
  private:
diff --git a/Includes/Core/CUDA/CUDAUtils.hpp b/Includes/Core/CUDA/CUDAUtils.hpp
index 4ebaeda9e4..074709adec 100644
--- a/Includes/Core/CUDA/CUDAUtils.hpp
+++ b/Includes/Core/CUDA/CUDAUtils.hpp
@@ -15,7 +15,11 @@
 
 #include <Core/Utils/Macros.hpp>
 
+#if defined(__HIP__)
+#include <Core/CUDA/cuda_to_hip.h>
+#else
 #include <cuda_runtime.h>
+#endif
 
 #include <algorithm>
 #include <cstdint>
@@ -37,6 +41,11 @@ inline CUBBYFLOW_CUDA_HOST void CUDAComputeGridSize(unsigned int n,
     numBlocks = CUDADivRoundUp(n, numThreads);
 }
 
+// HIP's vector types (HIP_vector_type) already provide arithmetic, compound
+// assignment, and equality operators for floatN, so these CUDA-side definitions
+// would be ambiguous on ROCm. Only the named helpers below (Dot, Length, To*)
+// are kept on both backends.
+#if !defined(__HIP__)
 inline CUBBYFLOW_CUDA_HOST_DEVICE float2 operator+(float2 a, float2 b)
 {
     return make_float2(a.x + b.x, a.y + b.y);
@@ -341,6 +350,7 @@ inline CUBBYFLOW_CUDA_HOST_DEVICE bool operator==(float4 a, float4 b)
     return std::abs(a.x - b.x) < 1e-6f && std::abs(a.y - b.y) < 1e-6f &&
            std::abs(a.z - b.z) < 1e-6f && std::abs(a.w - b.w) < 1e-6f;
 }
+#endif  // !__HIP__
 
 inline CUBBYFLOW_CUDA_HOST_DEVICE float Dot(float2 a, float2 b)
 {
diff --git a/Includes/Core/CUDA/ThrustUtils.hpp b/Includes/Core/CUDA/ThrustUtils.hpp
index 1c8b367607..623c680164 100644
--- a/Includes/Core/CUDA/ThrustUtils.hpp
+++ b/Includes/Core/CUDA/ThrustUtils.hpp
@@ -15,7 +15,11 @@
 
 #include <Core/CUDA/CUDAArray.hpp>
 
+#if defined(__HIP__)
+#include <Core/CUDA/cuda_to_hip.h>
+#else
 #include <cuda_runtime.h>
+#endif
 
 #include <thrust/device_ptr.h>
 
diff --git a/Includes/Core/CUDA/cuda_to_hip.h b/Includes/Core/CUDA/cuda_to_hip.h
new file mode 100644
index 0000000000..51fe10363c
--- /dev/null
+++ b/Includes/Core/CUDA/cuda_to_hip.h
@@ -0,0 +1,52 @@
+// CubbyFlow is voxel-based fluid simulation engine for computer games.
+// Copyright (c) 2020 CubbyFlow Team
+// Copyright (c) 2026 Advanced Micro Devices, Inc.
+//
+// HIP/ROCm compatibility shim. Force-included on every HIP translation unit so
+// the project's CUDA spelling (runtime API, vector types, and the device-vs-host
+// preprocessor idioms) maps onto HIP without rewriting the .cu/.hpp sources.
+//
+// \author Jeff Daily <jeff.daily@amd.com>
+
+#ifndef CUBBYFLOW_CUDA_TO_HIP_H
+#define CUBBYFLOW_CUDA_TO_HIP_H
+
+#if defined(CUBBYFLOW_USE_CUDA) && defined(__HIP__)
+
+#include <hip/hip_runtime.h>
+
+// hipcc defines __HIPCC__ on both compile passes but NOT __CUDACC__. The project
+// gates its kernel definitions and host/device attribute macros on __CUDACC__;
+// those guards are extended in-place to also accept __HIPCC__. __CUDACC__ itself
+// is deliberately left undefined here: rocThrust keys its backend selection on
+// __CUDACC__ (thrust/detail/config/compiler.h) and would otherwise fall back to
+// its CUDA backend (pulling a CUDA-only CUB header) instead of the HIP backend.
+
+// __CUDA_ARCH__ is undefined under HIP; HIP signals the device compile pass with
+// __HIP_DEVICE_COMPILE__. The project selects between device (T&) and host
+// (copy-back wrapper) return types for accessors via #ifdef __CUDA_ARCH__. clang
+// compiles host and device in separate passes like nvcc, so defining __CUDA_ARCH__
+// only in the device pass makes that per-pass selection resolve correctly.
+#if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__ && \
+    !defined(__CUDA_ARCH__)
+#define __CUDA_ARCH__ 1
+#endif
+
+// Runtime API: the project uses only a handful of symbols.
+#define cudaError_t hipError_t
+#define cudaSuccess hipSuccess
+#define cudaGetLastError hipGetLastError
+#define cudaGetErrorString hipGetErrorString
+#define cudaDeviceReset hipDeviceReset
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaMalloc hipMalloc
+#define cudaFree hipFree
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpyKind hipMemcpyKind
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+
+#endif  // CUBBYFLOW_USE_CUDA && __HIP__
+
+#endif  // CUBBYFLOW_CUDA_TO_HIP_H
diff --git a/Includes/Core/Utils/Macros.hpp b/Includes/Core/Utils/Macros.hpp
index 246b80dd82..7fdc240182 100644
--- a/Includes/Core/Utils/Macros.hpp
+++ b/Includes/Core/Utils/Macros.hpp
@@ -14,17 +14,17 @@
 #ifdef CUBBYFLOW_USE_CUDA
 
 // Host vs. device
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 #define CUBBYFLOW_CUDA_DEVICE __device__
 #define CUBBYFLOW_CUDA_HOST __host__
 #else
 #define CUBBYFLOW_CUDA_DEVICE
 #define CUBBYFLOW_CUDA_HOST
-#endif  // __CUDACC__
+#endif  // __CUDACC__ || __HIPCC__
 #define CUBBYFLOW_CUDA_HOST_DEVICE CUBBYFLOW_CUDA_HOST CUBBYFLOW_CUDA_DEVICE
 
 // Alignment
-#ifdef __CUDACC__  // NVCC
+#if defined(__CUDACC__) || defined(__HIPCC__)  // NVCC or hipcc
 #define CUBBYFLOW_CUDA_ALIGN(n) __align__(n)
 #elif defined(__GNUC__)  // GCC
 #define CUBBYFLOW_CUDA_ALIGN(n) __attribute__((aligned(n)))
@@ -32,17 +32,24 @@
 #define CUBBYFLOW_CUDA_ALIGN(n) __declspec(align(n))
 #else
 #error "Don't know how to handle CUBBYFLOW_CUDA_ALIGN"
-#endif  // __CUDACC__
+#endif  // __CUDACC__ || __HIPCC__
 
 // Exception
+// The result is bound to a typed local first: HIP marks the runtime error type
+// nodiscard, so feeding an API call straight into the comparison trips
+// -Werror=unused-value under clang. Binding consumes the value explicitly.
 #define _CUBBYFLOW_CUDA_CHECK(result, msg, file, line)                      \
-    if (result != cudaSuccess)                                              \
     {                                                                       \
-        fprintf(stderr, "CUDA error at %s:%d code=%d (%s) \"%s\" \n", file, \
-                line, static_cast<unsigned int>(result),                    \
-                cudaGetErrorString(result), msg);                           \
-        cudaDeviceReset();                                                  \
-        exit(EXIT_FAILURE);                                                 \
+        cudaError_t _cubbyflow_cuda_err = (result);                         \
+        if (_cubbyflow_cuda_err != cudaSuccess)                             \
+        {                                                                   \
+            fprintf(stderr, "CUDA error at %s:%d code=%d (%s) \"%s\" \n",    \
+                    file, line,                                             \
+                    static_cast<unsigned int>(_cubbyflow_cuda_err),         \
+                    cudaGetErrorString(_cubbyflow_cuda_err), msg);          \
+            static_cast<void>(cudaDeviceReset());                           \
+            exit(EXIT_FAILURE);                                            \
+        }                                                                   \
     }
 
 #define CUBBYFLOW_CUDA_CHECK(expression) \
diff --git a/Sources/Core/CMakeLists.txt b/Sources/Core/CMakeLists.txt
index 41a5e721ea..c5cb7402a1 100644
--- a/Sources/Core/CMakeLists.txt
+++ b/Sources/Core/CMakeLists.txt
@@ -18,6 +18,16 @@ if (USE_CUDA)
     file(GLOB_RECURSE sources
         ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/CUDA/*.cu)
+elseif (USE_HIP)
+    file(GLOB_RECURSE sources
+        ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/CUDA/*.cu)
+    # The CUDA directory's .cpp host sources also touch device types (device
+    # memory, thrust device_ptr), so compile the whole directory with hipcc.
+    file(GLOB_RECURSE hip_sources
+        ${CMAKE_CURRENT_SOURCE_DIR}/CUDA/*.cu
+        ${CMAKE_CURRENT_SOURCE_DIR}/CUDA/*.cpp)
+    set_source_files_properties(${hip_sources} PROPERTIES LANGUAGE HIP)
 else()
     file(GLOB_RECURSE sources
         ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
diff --git a/Sources/Core/CUDA/CUDASPHSystemData2Func.hpp b/Sources/Core/CUDA/CUDASPHSystemData2Func.hpp
index af7f3ca40b..9f03f52c77 100644
--- a/Sources/Core/CUDA/CUDASPHSystemData2Func.hpp
+++ b/Sources/Core/CUDA/CUDASPHSystemData2Func.hpp
@@ -17,7 +17,11 @@
 
 #include <Core/CUDA/CUDASPHKernels2.hpp>
 
+#if defined(__HIP__)
+#include <Core/CUDA/cuda_to_hip.h>
+#else
 #include <cuda_runtime.h>
+#endif
 
 namespace CubbyFlow
 {
diff --git a/Sources/Core/CUDA/CUDASPHSystemData3Func.hpp b/Sources/Core/CUDA/CUDASPHSystemData3Func.hpp
index 1cb7b59570..73a4c0941e 100644
--- a/Sources/Core/CUDA/CUDASPHSystemData3Func.hpp
+++ b/Sources/Core/CUDA/CUDASPHSystemData3Func.hpp
@@ -17,7 +17,11 @@
 
 #include <Core/CUDA/CUDASPHKernels3.hpp>
 
+#if defined(__HIP__)
+#include <Core/CUDA/cuda_to_hip.h>
+#else
 #include <cuda_runtime.h>
+#endif
 
 namespace CubbyFlow
 {
diff --git a/Tests/CUDATests/CMakeLists.txt b/Tests/CUDATests/CMakeLists.txt
index 393b9c2cf2..37ab7eb8f0 100644
--- a/Tests/CUDATests/CMakeLists.txt
+++ b/Tests/CUDATests/CMakeLists.txt
@@ -7,9 +7,15 @@ file(GLOB_RECURSE sources
     ${CMAKE_CURRENT_SOURCE_DIR}/*.cu)
 
 # Build executable
-cuda_add_executable(${target} ${sources}
-    OPTIONS ${CUDA_TARGET_OPTIONS}
-)
+if (USE_HIP)
+    file(GLOB_RECURSE hip_sources ${CMAKE_CURRENT_SOURCE_DIR}/*.cu)
+    set_source_files_properties(${hip_sources} PROPERTIES LANGUAGE HIP)
+    add_executable(${target} ${sources})
+else()
+    cuda_add_executable(${target} ${sources}
+        OPTIONS ${CUDA_TARGET_OPTIONS}
+    )
+endif()
 
 # Project options
 set_target_properties(${target}
diff --git a/Tests/CUDATests/CUDAArray2Tests.cu b/Tests/CUDATests/CUDAArray2Tests.cu
index 87763fa317..e41152e700 100644
--- a/Tests/CUDATests/CUDAArray2Tests.cu
+++ b/Tests/CUDATests/CUDAArray2Tests.cu
@@ -299,7 +299,7 @@ TEST_CASE("[CUDAArray2] - Resize")
         }
 
         arr.Resize(CUDAStdArray<size_t, 2>(8, 13), 4.f);
-        cudaDeviceSynchronize();
+        CHECK_EQ(cudaSuccess, cudaDeviceSynchronize());
         CHECK_EQ(8u, arr.Width());
         CHECK_EQ(13u, arr.Height());
         for (size_t i = 0; i < 8; ++i)
@@ -328,6 +328,7 @@ TEST_CASE("[CUDAArray2] - Resize")
         }
 
         arr.Resize(1, 9, 3.f);
+        CHECK_EQ(cudaSuccess, cudaDeviceSynchronize());
         CHECK_EQ(1u, arr.Width());
         CHECK_EQ(9u, arr.Height());
         for (size_t i = 0; i < 1; ++i)