NVIDIA · iwald-nvidia · Jun 17, 2026 · Jun 17, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -13,10 +13,27 @@ cmake_policy(SET CMP0048 NEW)
 set(CMAKE_BUILD_TYPE_INIT "Release")
 project(cuBQL VERSION ${CUBQL_VERSION} LANGUAGES C CXX)
 
+# Build the GPU code with HIP for AMD GPUs instead of CUDA. When ON, the .cu
+# sources are compiled as HIP (LANGUAGE HIP) and CUBQL_HAVE_HIP is set; the
+# per-type instantiation targets and GPU samples are built the same way as for
+# CUDA, gated on (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP).
+option(CUBQL_USE_HIP "Build cuBQL GPU code with HIP for AMD GPUs" OFF)
+
+# CUBQL_HAVE_CUDA / CUBQL_HAVE_HIP record which (if any) GPU toolchain is in
+# use; the GPU instantiation targets and samples below are gated on either.
+set(CUBQL_HAVE_CUDA OFF)
+set(CUBQL_HAVE_HIP OFF)
 if (CUBQL_OMP)
   set(CUBQL_DISABLE_CUDA ON)
 endif()
-if (CUBQL_DISABLE_CUDA)
+if (CUBQL_USE_HIP)
+  message("#cuBQL: building GPU code with HIP for AMD GPUs")
+  enable_language(HIP)
+  if (NOT DEFINED CMAKE_HIP_ARCHITECTURES OR CMAKE_HIP_ARCHITECTURES STREQUAL "")
+    set(CMAKE_HIP_ARCHITECTURES "gfx90a")
+  endif()
+  set(CUBQL_HAVE_HIP ON)
+elseif (CUBQL_DISABLE_CUDA)
   message("#cuBQL: CUDA _DISABLED_ by user request")
   set(CUBQL_HAVE_CUDA OFF)
 else()
@@ -95,7 +112,10 @@ if (CUBQL_IS_SUBPROJECT)
       )
   endif()
 else()
-  if (CMAKE_CUDA_ARCHITECTURES)
+  if (CUBQL_USE_HIP)
+    # HIP build: the arch lives in CMAKE_HIP_ARCHITECTURES (defaulted above);
+    # do not derive or require a CUDA arch.
+  elseif (CMAKE_CUDA_ARCHITECTURES)
     # set on the cmdline, for a local build
     set(CUBQL_CUDA_ARCHITECTURES_INIT "${CMAKE_CUDA_ARCHITECTURES}")
   elseif (${CMAKE_VERSION} VERSION_LESS 3.24)
@@ -119,10 +139,12 @@ else()
   else()
     set(CUBQL_CUDA_ARCHITECTURES_INIT "native")
   endif()
-  set(CUBQL_CUDA_ARCHITECTURES "${CUBQL_CUDA_ARCHITECTURES_INIT}" CACHE STRING
-    "Which CUDA architecture to build for")
-  set(CMAKE_CUDA_ARCHITECTURES "${CUBQL_CUDA_ARCHITECTURES}")
-
+  if (NOT CUBQL_USE_HIP)
+    set(CUBQL_CUDA_ARCHITECTURES "${CUBQL_CUDA_ARCHITECTURES_INIT}" CACHE STRING
+      "Which CUDA architecture to build for")
+    set(CMAKE_CUDA_ARCHITECTURES "${CUBQL_CUDA_ARCHITECTURES}")
+  endif()
+
   if(CMAKE_CONFIGURATION_TYPES) # multiconfig generator?
     set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "" FORCE) 
   else()

diff --git a/README.md b/README.md
@@ -316,6 +316,12 @@ However, we strongly suggest to use `cmake`, include cuBQL as a cmake
 `add_subdirectory(...)`, and then `target_link_libraries(...)` with
 the desired cuBQL cmake target.
 
+cuBQL also builds for AMD GPUs through ROCm/HIP. Configure with
+`-DCUBQL_USE_HIP=ON` (and set `CMAKE_HIP_ARCHITECTURES` to the target
+GPU, for example `gfx90a`) to compile the GPU builders and traversers
+with `hipcc` instead of `nvcc`; the same headers and cmake targets are
+used. Without `CUBQL_USE_HIP` the build is the usual CUDA build.
+
 ## Building in Header-only (explicit instantiation) mode:
 
 - in your own CUDA sources (say, `userMain.cu`):

diff --git a/cuBQL/CMakeLists.txt b/cuBQL/CMakeLists.txt
@@ -66,6 +66,10 @@ function(add_specific_instantiation device suffix T D)
       PUBLIC
       -DCUBQL_HAVE_CUDA=1
       )
+    if (CUBQL_USE_HIP)
+      set_source_files_properties(builder/${device}/instantiate_builders.${suffix}
+        PROPERTIES LANGUAGE HIP)
+    endif()
   endif()
   set_target_properties(cuBQL_${device}_${T}${D}
     PROPERTIES
@@ -91,6 +95,10 @@ function(add_specific_instantiation device suffix T D)
       PUBLIC
       -DCUBQL_HAVE_CUDA=1
       )
+    if (CUBQL_USE_HIP)
+      set_source_files_properties(builder/${device}/instantiate_builders.${suffix}
+        PROPERTIES LANGUAGE HIP)
+    endif()
   endif()
   set_target_properties(cuBQL_${device}_${T}${D}_static
     PROPERTIES
@@ -130,7 +138,7 @@ endfunction()
 foreach(T IN ITEMS float int double longlong)
   foreach(D IN ITEMS 2 3 4)
     add_specific_instantiation(cpu cpp ${T} ${D})
-    if (CUBQL_HAVE_CUDA)
+    if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP)
       add_specific_instantiation(cuda cu ${T} ${D})
     endif()
   endforeach()

diff --git a/cuBQL/builder/cuda.h b/cuBQL/builder/cuda.h
@@ -54,7 +54,10 @@ namespace cuBQL {
     }
   };
 
-#if CUDART_VERSION >= 11020
+// hipMallocAsync and the hipMemPool API are available on ROCm; on the HIP
+// toolchain CUDART_VERSION is undefined (0), so select the async path
+// explicitly there. The CUDA arm keeps its >= 11020 floor unchanged.
+#if defined(__HIPCC__) || CUDART_VERSION >= 11020
   /* Allocator that uses cudaMallocAsync to allocate memory. This can
      be much faster than cudaMalloc because it doesn't require a
      device sync for each malloc; but .. CAREFUL: to get memory to be

diff --git a/cuBQL/bvh.h b/cuBQL/bvh.h
@@ -145,15 +145,15 @@ namespace cuBQL {
   using bvh3f = BinaryBVH<float,3>;
   using bvh3d = BinaryBVH<double,3>;
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
   typedef BinaryBVH<float,2> bvh_float2;
   typedef BinaryBVH<float,3> bvh_float3;
   typedef BinaryBVH<float,4> bvh_float4;
 #endif
-  
+
 } // ::cuBQL
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
 # include "cuBQL/builder/cuda.h"
 #endif
 

diff --git a/cuBQL/math/common.h b/cuBQL/math/common.h
@@ -28,6 +28,48 @@
 #  include <hip/hip_runtime.h>
 #  include <hip/driver_types.h>
 #  include <hip/hip_runtime.h>
+// cuBQL spells the device runtime API as cudaXxx (call sites and the
+// CUBQL_CUDA_CALL(call) -> cuda##call token-paste macro). HIP does not
+// provide the cudaXxx aliases, so map the exact set cuBQL uses onto the
+// hipXxx equivalents. Confined to the __HIPCC__ path so the CUDA build is
+// unchanged.
+using cudaError_t  = hipError_t;
+using cudaStream_t = hipStream_t;
+using cudaEvent_t  = hipEvent_t;
+using cudaMemPool_t = hipMemPool_t;
+#  define cudaSuccess                      hipSuccess
+#  define cudaMemcpyDeviceToHost           hipMemcpyDeviceToHost
+#  define cudaMemcpyHostToDevice           hipMemcpyHostToDevice
+#  define cudaMemcpyDeviceToDevice         hipMemcpyDeviceToDevice
+#  define cudaMemcpyDefault                hipMemcpyDefault
+#  define cudaMemPoolAttrReleaseThreshold  hipMemPoolAttrReleaseThreshold
+#  define cudaGetErrorString               hipGetErrorString
+#  define cudaGetLastError                 hipGetLastError
+#  define cudaDeviceSynchronize            hipDeviceSynchronize
+#  define cudaStreamSynchronize            hipStreamSynchronize
+#  define cudaGetDevice                    hipGetDevice
+#  define cudaSetDevice                    hipSetDevice
+#  define cudaGetDeviceCount               hipGetDeviceCount
+#  define cudaMalloc                       hipMalloc
+#  define cudaMallocHost                   hipHostMalloc
+#  define cudaFree                         hipFree
+#  define cudaFreeHost                     hipHostFree
+#  define cudaMallocManaged                hipMallocManaged
+#  define cudaMallocAsync                  hipMallocAsync
+#  define cudaFreeAsync                    hipFreeAsync
+#  define cudaMemcpy                       hipMemcpy
+#  define cudaMemcpyAsync                  hipMemcpyAsync
+#  define cudaMemcpyFromSymbol             hipMemcpyFromSymbol
+#  define cudaMemcpyToSymbol               hipMemcpyToSymbol
+#  define cudaMemset                       hipMemset
+#  define cudaMemsetAsync                  hipMemsetAsync
+#  define cudaDeviceGetDefaultMemPool      hipDeviceGetDefaultMemPool
+#  define cudaMemPoolSetAttribute          hipMemPoolSetAttribute
+#  define cudaEventCreate                  hipEventCreate
+#  define cudaEventDestroy                 hipEventDestroy
+#  define cudaEventRecord                  hipEventRecord
+#  define cudaEventSynchronize             hipEventSynchronize
+#  define cudaEventElapsedTime             hipEventElapsedTime
 #elif defined(__CUDACC__)
 #  include <cuda_runtime.h>
 // on cuda 13.2/gcc 15.2/ubuntu 25.10 we need to include that before cuda.h gets included

diff --git a/cuBQL/math/constants.h b/cuBQL/math/constants.h
@@ -8,6 +8,23 @@
 #if defined(__CUDACC__) && !defined(CUDART_INF_F)
 #include <math_constants.h>
 #endif
+#if defined(__HIPCC__)
+// CUDA's <math_constants.h> (the CUDART_* device float constants) has no HIP
+// analogue; provide the ones cuBQL uses so the shared device code compiles
+// unchanged on the HIP path.
+# ifndef CUDART_INF_F
+#  define CUDART_INF_F __builtin_huge_valf()
+# endif
+# ifndef CUDART_INF
+#  define CUDART_INF   __builtin_huge_val()
+# endif
+# ifndef CUDART_NAN_F
+#  define CUDART_NAN_F __builtin_nanf("")
+# endif
+# ifndef CUDART_NAN
+#  define CUDART_NAN   __builtin_nan("")
+# endif
+#endif
 
 #ifndef M_PI
 #define M_PI 3.141593f

diff --git a/cuBQL/math/vec.h b/cuBQL/math/vec.h
@@ -84,10 +84,10 @@ namespace cuBQL {
     using cuda_t = typename cuda_eq_t<T,2>::type;
     inline __cubql_both operator cuda_t() { cuda_t t; t.x = x; t.y = y; return t; }
 #endif
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
     /*! allow to typecast that to a dim3, so it can be used as a cuda kernel launch dim */
     inline __cubql_both operator dim3() { dim3 t; t.x = x; t.y = y; t.z = 1; return t; }
-#endif    
+#endif
     T x, y;
   };
   template<typename T>
@@ -96,10 +96,10 @@ namespace cuBQL {
     inline __cubql_both T  operator[](int i) const { return (i==2)?z:(i?y:x); }
     inline __cubql_both T &operator[](int i)       { return (i==2)?z:(i?y:x); }
     /*! auto-cast to equivalent cuda type */
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
     /*! allow to typecast that to a dim3, so it can be used as a cuda kernel launch dim */
     inline __cubql_both operator dim3() { dim3 t; t.x = x; t.y = y; t.z = z; return t; }
-#endif    
+#endif
 #if CUBQL_SUPPORT_CUDA_VECTOR_TYPES
     using cuda_t = typename cuda_eq_t<T,3>::type;
     inline __cubql_both operator cuda_t() { cuda_t t; t.x = x; t.y = y; t.z = z; return t; }

diff --git a/cuBQL/queries/pointData/findClosest.h b/cuBQL/queries/pointData/findClosest.h
@@ -9,6 +9,17 @@
 #include "cuBQL/bvh.h"
 #include "cuBQL/traversal/shrinkingRadiusQuery.h"
 
+// The findClosest* queries are forward-declared __host__ __device__ (so the
+// CPU sample can call them) but defined __device__-only. nvcc tolerates that
+// decl/def host/device-attribute mismatch; HIP/clang rejects it. On the HIP
+// device path align the forward declarations to the device-only definitions.
+// Outside HIP (CUDA and the plain-C++ CPU build) the spelling is unchanged.
+#if defined(__HIPCC__)
+# define __cubql_findClosest_decl __cubql_device
+#else
+# define __cubql_findClosest_decl __cubql_both
+#endif
+
 namespace cuBQL {
   namespace points {
 
@@ -34,7 +45,7 @@ namespace cuBQL {
       not)
     */
     template<typename T, int D>
-    inline __cubql_both
+    inline __cubql_findClosest_decl
     int findClosest(/*! binary bvh built over the given points[]
                       specfied below */
                     BinaryBVH<T,D> bvhOverPoints,
@@ -65,7 +76,7 @@ namespace cuBQL {
       not)
     */
     template<typename T, int D, int W>
-    inline __cubql_both
+    inline __cubql_findClosest_decl
     int findClosest(/*! binary bvh built over the given points[]
                       specfied below */
                     WideBVH<T,D,W> bvhOverPoints,
@@ -81,7 +92,7 @@ namespace cuBQL {
 
 
     template<typename T, int D>
-    inline __cubql_both
+    inline __cubql_findClosest_decl
     int findClosest_exludeID(/*! primitive ID to _exclude_ from queries */
                              int idOfPointtoExclude,
                              /*! binary bvh built over the given points[]

diff --git a/cuBQL/traversal/shrinkingRadiusQuery.h b/cuBQL/traversal/shrinkingRadiusQuery.h
@@ -337,7 +337,8 @@ namespace cuBQL {
       }
     }
 
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+    // device pass: use the toolchain's __int_as_float/__float_as_int builtins
 #else
     inline float __int_as_float(int i) { return (const float &)i; }
     inline int __float_as_int(float f) { return (const int &)f; }

diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
@@ -1,6 +1,19 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+# When building the GPU code with HIP, the sample .cu sources must be compiled
+# as HIP. Mark a target's CUDA sources accordingly.
+function(cubql_sample_hip_lang target)
+  if (CUBQL_USE_HIP)
+    get_target_property(_srcs ${target} SOURCES)
+    foreach(_s IN LISTS _srcs)
+      if (_s MATCHES "\\.cu$")
+        set_source_files_properties(${_s} PROPERTIES LANGUAGE HIP)
+      endif()
+    endforeach()
+  endif()
+endfunction()
+
 # common stuff - data generators and IO for loading/generating test data
 add_subdirectory(common)
 if (NOT (TARGET stb_image))
@@ -19,10 +32,11 @@ add_subdirectory(s01_closestPoint_points_gpu)
 # sample that performs gpu-side closest-point queries between sets of
 # float3 points
 # ==================================================================
-if (CUBQL_HAVE_CUDA)
+if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP)
   add_executable(sample02_distanceToTriangleMesh
     s02_distanceToTriangleMesh/distanceToTriangleMesh.cu
     )
+  cubql_sample_hip_lang(sample02_distanceToTriangleMesh)
   target_link_libraries(sample02_distanceToTriangleMesh
     # the cuda-side builders for float3 data
     cuBQL_cuda_float3
@@ -40,10 +54,11 @@ endif()
 # NxNxN grid. QUeries are performed by tracing axis-aligned rays and
 # computing cuqbl::crossingCount queries.
 # ==================================================================
-if (CUBQL_HAVE_CUDA)
+if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP)
   add_executable(sample03_insideOutside
     s03_insideOutsideOfClosedMesh/insideOutside.cu
     )
+  cubql_sample_hip_lang(sample03_insideOutside)
   target_link_libraries(sample03_insideOutside
     # the cuda-side builders for float3 data
     cuBQL_cuda_float3
@@ -59,10 +74,11 @@ endif()
 # full inside the surface (0), fully outside (1), or whether it
 # intersects with one of the triangles of the surface mesh
 # ==================================================================
-if (CUBQL_HAVE_CUDA)
+if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP)
   add_executable(sample04_boxOverlapsOrInsideSurfaceMesh
     s04_boxOverlapsOrInsideSurfaceMesh/boxOverlapsOrInsideSurfaceMesh.cu
     )
+  cubql_sample_hip_lang(sample04_boxOverlapsOrInsideSurfaceMesh)
   target_link_libraries(sample04_boxOverlapsOrInsideSurfaceMesh
     # the cuda-side builders for float3 data
     cuBQL_cuda_float3
@@ -79,10 +95,11 @@ endif()
 # whether the ray did pass through unoccluded (white) or got occluded
 # (black), then saves that as an image
 # ==================================================================
-if (CUBQL_HAVE_CUDA)
+if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP)
   add_executable(sample05_lineOfSight
     s05_lineOfSight/lineOfSight.cu
     )
+  cubql_sample_hip_lang(sample05_lineOfSight)
   target_link_libraries(sample05_lineOfSight
     # the cuda-side builders for float3 data
     cuBQL_cuda_float3
@@ -99,10 +116,11 @@ endif()
 # if not. Chosen distance for this sample is 1% of scene diameter;
 # this uses the triangles::anyWithinRadius() query.
 # ==================================================================
-if (CUBQL_HAVE_CUDA)
+if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP)
   add_executable(sample06_anyTriangleWithinRadius
     s06_anyTriangleWithinRadius/anyTriangleWithinRadius.cu
     )
+  cubql_sample_hip_lang(sample06_anyTriangleWithinRadius)
   target_link_libraries(sample06_anyTriangleWithinRadius
     # the cuda-side builders for float3 data
     cuBQL_cuda_float3
@@ -112,10 +130,11 @@ if (CUBQL_HAVE_CUDA)
     )
 endif()
 
-if (CUBQL_HAVE_CUDA)
+if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP)
   add_executable(sample07_aggregateNBody
     s07_aggregateNBody/aggregateNBody.cu
     )
+  cubql_sample_hip_lang(sample07_aggregateNBody)
   target_link_libraries(sample07_aggregateNBody
     # the cuda-side builders for float3 data
     cuBQL_cuda_float3

diff --git a/samples/s01_closestPoint_points_gpu/CMakeLists.txt b/samples/s01_closestPoint_points_gpu/CMakeLists.txt
@@ -1,7 +1,11 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-if (CUBQL_HAVE_CUDA)
+if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP)
+  if (CUBQL_USE_HIP)
+    set_source_files_properties(closestPoint.cu closestPoint_WideBVH.cu
+      PROPERTIES LANGUAGE HIP)
+  endif()
   add_executable(cuBQL_sample01_points_closestPoint_cuda
     closestPoint.cu
     )