From b0ea6a1712b40809df28950c9bb3746bc6e99083 Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Wed, 17 Jun 2026 00:28:42 +0000 Subject: [PATCH] [ROCm] Add HIP build path for AMD GPUs cuBQL already carried first-party HIP source scaffolding (the __HIPCC__ includes of and the cub->hipcub alias), but nothing ever invoked hipcc and several HIP gaps left that path uncompilable. This finishes and wires up the HIP path so the library, the instantiate_builders translation unit, and all GPU samples build and link for AMD GPUs, while keeping the CUDA build byte-identical. Review order: the CMake change first (root CMakeLists.txt adds a CUBQL_USE_HIP option that enables the HIP language, defaults CMAKE_HIP_ARCHITECTURES without hardcoding an arch, and sets a distinct CUBQL_HAVE_HIP switch alongside CUBQL_HAVE_CUDA; the GPU instantiation targets and samples build when either is set, and cuBQL/CMakeLists.txt and the sample CMake mark the .cu sources LANGUAGE HIP), then the source shims. math/common.h maps the cudaXxx runtime symbols cuBQL uses onto their hipXxx equivalents inside the existing __HIPCC__ block (every call funnels through the CUBQL_CUDA_CALL(call) -> cuda##call macro). math/constants.h provides the CUDART_INF_F/CUDART_NAN device float constants, which have no HIP analogue. builder/cuda.h widens the cudaMallocAsync version guard to fire on HIP (CUDART_VERSION is 0 there). bvh.h includes the GPU builder and declares the bvh_floatN typedefs under HIP too. shrinkingRadiusQuery.h fixes a host/device fallback that was gated on __CUDA_ARCH__ to also key off __HIP_DEVICE_COMPILE__ (HIP's per-pass macro), and findClosest.h aligns three forward declarations to their device-only definitions on the HIP path (clang rejects the decl/def host/device-attribute mismatch that nvcc tolerated). vec.h widens the dim3 conversion operators to HIP. The README's Building section documents the new CUBQL_USE_HIP option. Every source change is guarded by __HIPCC__ / __HIP_DEVICE_COMPILE__, and every CMake change by CUBQL_USE_HIP (default OFF) / LANGUAGE HIP, so a build without CUBQL_USE_HIP produces an unchanged CUDA configuration. This work was authored with the assistance of Claude, an AI assistant. Test Plan: cmake -S . -B build-hip -DCUBQL_USE_HIP=ON \ -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_BUILD_TYPE=Release cmake --build build-hip -j --target \ cuBQL_cuda_float3 \ cuBQL_sample01_points_closestPoint_cuda \ cuBQL_sample01_points_closestPoint_wideBVH_cuda \ sample02_distanceToTriangleMesh sample03_insideOutside \ sample04_boxOverlapsOrInsideSurfaceMesh sample05_lineOfSight \ sample06_anyTriangleWithinRadius sample07_aggregateNBody HIP_VISIBLE_DEVICES=0 ./build-hip/cuBQL_sample01_points_closestPoint_cuda Built and linked the library, the instantiate_builders TU, and all eight GPU samples with hipcc (ROCm 7.2.1) for gfx90a, gfx1100, and gfx1201; the device code objects are amdgcn. sample01 closest-point and samples 02-07 run on GPU and return correct results. Configuring without CUBQL_USE_HIP keeps CMAKE_CUDA_ARCHITECTURES at native and never enables HIP; the CPU host targets build clean in both modes. --- CMakeLists.txt | 34 ++++++++++++--- README.md | 6 +++ cuBQL/CMakeLists.txt | 10 ++++- cuBQL/builder/cuda.h | 5 ++- cuBQL/bvh.h | 6 +-- cuBQL/math/common.h | 42 +++++++++++++++++++ cuBQL/math/constants.h | 17 ++++++++ cuBQL/math/vec.h | 8 ++-- cuBQL/queries/pointData/findClosest.h | 17 ++++++-- cuBQL/traversal/shrinkingRadiusQuery.h | 3 +- samples/CMakeLists.txt | 31 +++++++++++--- .../CMakeLists.txt | 6 ++- 12 files changed, 159 insertions(+), 26 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 51fef82..6669343 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,10 +13,27 @@ cmake_policy(SET CMP0048 NEW) set(CMAKE_BUILD_TYPE_INIT "Release") project(cuBQL VERSION ${CUBQL_VERSION} LANGUAGES C CXX) +# Build the GPU code with HIP for AMD GPUs instead of CUDA. When ON, the .cu +# sources are compiled as HIP (LANGUAGE HIP) and CUBQL_HAVE_HIP is set; the +# per-type instantiation targets and GPU samples are built the same way as for +# CUDA, gated on (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP). +option(CUBQL_USE_HIP "Build cuBQL GPU code with HIP for AMD GPUs" OFF) + +# CUBQL_HAVE_CUDA / CUBQL_HAVE_HIP record which (if any) GPU toolchain is in +# use; the GPU instantiation targets and samples below are gated on either. +set(CUBQL_HAVE_CUDA OFF) +set(CUBQL_HAVE_HIP OFF) if (CUBQL_OMP) set(CUBQL_DISABLE_CUDA ON) endif() -if (CUBQL_DISABLE_CUDA) +if (CUBQL_USE_HIP) + message("#cuBQL: building GPU code with HIP for AMD GPUs") + enable_language(HIP) + if (NOT DEFINED CMAKE_HIP_ARCHITECTURES OR CMAKE_HIP_ARCHITECTURES STREQUAL "") + set(CMAKE_HIP_ARCHITECTURES "gfx90a") + endif() + set(CUBQL_HAVE_HIP ON) +elseif (CUBQL_DISABLE_CUDA) message("#cuBQL: CUDA _DISABLED_ by user request") set(CUBQL_HAVE_CUDA OFF) else() @@ -95,7 +112,10 @@ if (CUBQL_IS_SUBPROJECT) ) endif() else() - if (CMAKE_CUDA_ARCHITECTURES) + if (CUBQL_USE_HIP) + # HIP build: the arch lives in CMAKE_HIP_ARCHITECTURES (defaulted above); + # do not derive or require a CUDA arch. + elseif (CMAKE_CUDA_ARCHITECTURES) # set on the cmdline, for a local build set(CUBQL_CUDA_ARCHITECTURES_INIT "${CMAKE_CUDA_ARCHITECTURES}") elseif (${CMAKE_VERSION} VERSION_LESS 3.24) @@ -119,10 +139,12 @@ else() else() set(CUBQL_CUDA_ARCHITECTURES_INIT "native") endif() - set(CUBQL_CUDA_ARCHITECTURES "${CUBQL_CUDA_ARCHITECTURES_INIT}" CACHE STRING - "Which CUDA architecture to build for") - set(CMAKE_CUDA_ARCHITECTURES "${CUBQL_CUDA_ARCHITECTURES}") - + if (NOT CUBQL_USE_HIP) + set(CUBQL_CUDA_ARCHITECTURES "${CUBQL_CUDA_ARCHITECTURES_INIT}" CACHE STRING + "Which CUDA architecture to build for") + set(CMAKE_CUDA_ARCHITECTURES "${CUBQL_CUDA_ARCHITECTURES}") + endif() + if(CMAKE_CONFIGURATION_TYPES) # multiconfig generator? set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "" FORCE) else() diff --git a/README.md b/README.md index fd5e872..1ff09ff 100644 --- a/README.md +++ b/README.md @@ -316,6 +316,12 @@ However, we strongly suggest to use `cmake`, include cuBQL as a cmake `add_subdirectory(...)`, and then `target_link_libraries(...)` with the desired cuBQL cmake target. +cuBQL also builds for AMD GPUs through ROCm/HIP. Configure with +`-DCUBQL_USE_HIP=ON` (and set `CMAKE_HIP_ARCHITECTURES` to the target +GPU, for example `gfx90a`) to compile the GPU builders and traversers +with `hipcc` instead of `nvcc`; the same headers and cmake targets are +used. Without `CUBQL_USE_HIP` the build is the usual CUDA build. + ## Building in Header-only (explicit instantiation) mode: - in your own CUDA sources (say, `userMain.cu`): diff --git a/cuBQL/CMakeLists.txt b/cuBQL/CMakeLists.txt index 5f2c7d8..5c48af9 100644 --- a/cuBQL/CMakeLists.txt +++ b/cuBQL/CMakeLists.txt @@ -66,6 +66,10 @@ function(add_specific_instantiation device suffix T D) PUBLIC -DCUBQL_HAVE_CUDA=1 ) + if (CUBQL_USE_HIP) + set_source_files_properties(builder/${device}/instantiate_builders.${suffix} + PROPERTIES LANGUAGE HIP) + endif() endif() set_target_properties(cuBQL_${device}_${T}${D} PROPERTIES @@ -91,6 +95,10 @@ function(add_specific_instantiation device suffix T D) PUBLIC -DCUBQL_HAVE_CUDA=1 ) + if (CUBQL_USE_HIP) + set_source_files_properties(builder/${device}/instantiate_builders.${suffix} + PROPERTIES LANGUAGE HIP) + endif() endif() set_target_properties(cuBQL_${device}_${T}${D}_static PROPERTIES @@ -130,7 +138,7 @@ endfunction() foreach(T IN ITEMS float int double longlong) foreach(D IN ITEMS 2 3 4) add_specific_instantiation(cpu cpp ${T} ${D}) - if (CUBQL_HAVE_CUDA) + if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP) add_specific_instantiation(cuda cu ${T} ${D}) endif() endforeach() diff --git a/cuBQL/builder/cuda.h b/cuBQL/builder/cuda.h index 061091b..a8f9e7f 100644 --- a/cuBQL/builder/cuda.h +++ b/cuBQL/builder/cuda.h @@ -54,7 +54,10 @@ namespace cuBQL { } }; -#if CUDART_VERSION >= 11020 +// hipMallocAsync and the hipMemPool API are available on ROCm; on the HIP +// toolchain CUDART_VERSION is undefined (0), so select the async path +// explicitly there. The CUDA arm keeps its >= 11020 floor unchanged. +#if defined(__HIPCC__) || CUDART_VERSION >= 11020 /* Allocator that uses cudaMallocAsync to allocate memory. This can be much faster than cudaMalloc because it doesn't require a device sync for each malloc; but .. CAREFUL: to get memory to be diff --git a/cuBQL/bvh.h b/cuBQL/bvh.h index 86535bc..0c75340 100644 --- a/cuBQL/bvh.h +++ b/cuBQL/bvh.h @@ -145,15 +145,15 @@ namespace cuBQL { using bvh3f = BinaryBVH; using bvh3d = BinaryBVH; -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) typedef BinaryBVH bvh_float2; typedef BinaryBVH bvh_float3; typedef BinaryBVH bvh_float4; #endif - + } // ::cuBQL -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) # include "cuBQL/builder/cuda.h" #endif diff --git a/cuBQL/math/common.h b/cuBQL/math/common.h index 00a1b68..915e2a3 100644 --- a/cuBQL/math/common.h +++ b/cuBQL/math/common.h @@ -28,6 +28,48 @@ # include # include # include +// cuBQL spells the device runtime API as cudaXxx (call sites and the +// CUBQL_CUDA_CALL(call) -> cuda##call token-paste macro). HIP does not +// provide the cudaXxx aliases, so map the exact set cuBQL uses onto the +// hipXxx equivalents. Confined to the __HIPCC__ path so the CUDA build is +// unchanged. +using cudaError_t = hipError_t; +using cudaStream_t = hipStream_t; +using cudaEvent_t = hipEvent_t; +using cudaMemPool_t = hipMemPool_t; +# define cudaSuccess hipSuccess +# define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +# define cudaMemcpyHostToDevice hipMemcpyHostToDevice +# define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +# define cudaMemcpyDefault hipMemcpyDefault +# define cudaMemPoolAttrReleaseThreshold hipMemPoolAttrReleaseThreshold +# define cudaGetErrorString hipGetErrorString +# define cudaGetLastError hipGetLastError +# define cudaDeviceSynchronize hipDeviceSynchronize +# define cudaStreamSynchronize hipStreamSynchronize +# define cudaGetDevice hipGetDevice +# define cudaSetDevice hipSetDevice +# define cudaGetDeviceCount hipGetDeviceCount +# define cudaMalloc hipMalloc +# define cudaMallocHost hipHostMalloc +# define cudaFree hipFree +# define cudaFreeHost hipHostFree +# define cudaMallocManaged hipMallocManaged +# define cudaMallocAsync hipMallocAsync +# define cudaFreeAsync hipFreeAsync +# define cudaMemcpy hipMemcpy +# define cudaMemcpyAsync hipMemcpyAsync +# define cudaMemcpyFromSymbol hipMemcpyFromSymbol +# define cudaMemcpyToSymbol hipMemcpyToSymbol +# define cudaMemset hipMemset +# define cudaMemsetAsync hipMemsetAsync +# define cudaDeviceGetDefaultMemPool hipDeviceGetDefaultMemPool +# define cudaMemPoolSetAttribute hipMemPoolSetAttribute +# define cudaEventCreate hipEventCreate +# define cudaEventDestroy hipEventDestroy +# define cudaEventRecord hipEventRecord +# define cudaEventSynchronize hipEventSynchronize +# define cudaEventElapsedTime hipEventElapsedTime #elif defined(__CUDACC__) # include // on cuda 13.2/gcc 15.2/ubuntu 25.10 we need to include that before cuda.h gets included diff --git a/cuBQL/math/constants.h b/cuBQL/math/constants.h index 4b5c3c4..d672269 100644 --- a/cuBQL/math/constants.h +++ b/cuBQL/math/constants.h @@ -8,6 +8,23 @@ #if defined(__CUDACC__) && !defined(CUDART_INF_F) #include #endif +#if defined(__HIPCC__) +// CUDA's (the CUDART_* device float constants) has no HIP +// analogue; provide the ones cuBQL uses so the shared device code compiles +// unchanged on the HIP path. +# ifndef CUDART_INF_F +# define CUDART_INF_F __builtin_huge_valf() +# endif +# ifndef CUDART_INF +# define CUDART_INF __builtin_huge_val() +# endif +# ifndef CUDART_NAN_F +# define CUDART_NAN_F __builtin_nanf("") +# endif +# ifndef CUDART_NAN +# define CUDART_NAN __builtin_nan("") +# endif +#endif #ifndef M_PI #define M_PI 3.141593f diff --git a/cuBQL/math/vec.h b/cuBQL/math/vec.h index 3048657..9259264 100644 --- a/cuBQL/math/vec.h +++ b/cuBQL/math/vec.h @@ -84,10 +84,10 @@ namespace cuBQL { using cuda_t = typename cuda_eq_t::type; inline __cubql_both operator cuda_t() { cuda_t t; t.x = x; t.y = y; return t; } #endif -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) /*! allow to typecast that to a dim3, so it can be used as a cuda kernel launch dim */ inline __cubql_both operator dim3() { dim3 t; t.x = x; t.y = y; t.z = 1; return t; } -#endif +#endif T x, y; }; template @@ -96,10 +96,10 @@ namespace cuBQL { inline __cubql_both T operator[](int i) const { return (i==2)?z:(i?y:x); } inline __cubql_both T &operator[](int i) { return (i==2)?z:(i?y:x); } /*! auto-cast to equivalent cuda type */ -#ifdef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) /*! allow to typecast that to a dim3, so it can be used as a cuda kernel launch dim */ inline __cubql_both operator dim3() { dim3 t; t.x = x; t.y = y; t.z = z; return t; } -#endif +#endif #if CUBQL_SUPPORT_CUDA_VECTOR_TYPES using cuda_t = typename cuda_eq_t::type; inline __cubql_both operator cuda_t() { cuda_t t; t.x = x; t.y = y; t.z = z; return t; } diff --git a/cuBQL/queries/pointData/findClosest.h b/cuBQL/queries/pointData/findClosest.h index dcfed2b..79d5a33 100644 --- a/cuBQL/queries/pointData/findClosest.h +++ b/cuBQL/queries/pointData/findClosest.h @@ -9,6 +9,17 @@ #include "cuBQL/bvh.h" #include "cuBQL/traversal/shrinkingRadiusQuery.h" +// The findClosest* queries are forward-declared __host__ __device__ (so the +// CPU sample can call them) but defined __device__-only. nvcc tolerates that +// decl/def host/device-attribute mismatch; HIP/clang rejects it. On the HIP +// device path align the forward declarations to the device-only definitions. +// Outside HIP (CUDA and the plain-C++ CPU build) the spelling is unchanged. +#if defined(__HIPCC__) +# define __cubql_findClosest_decl __cubql_device +#else +# define __cubql_findClosest_decl __cubql_both +#endif + namespace cuBQL { namespace points { @@ -34,7 +45,7 @@ namespace cuBQL { not) */ template - inline __cubql_both + inline __cubql_findClosest_decl int findClosest(/*! binary bvh built over the given points[] specfied below */ BinaryBVH bvhOverPoints, @@ -65,7 +76,7 @@ namespace cuBQL { not) */ template - inline __cubql_both + inline __cubql_findClosest_decl int findClosest(/*! binary bvh built over the given points[] specfied below */ WideBVH bvhOverPoints, @@ -81,7 +92,7 @@ namespace cuBQL { template - inline __cubql_both + inline __cubql_findClosest_decl int findClosest_exludeID(/*! primitive ID to _exclude_ from queries */ int idOfPointtoExclude, /*! binary bvh built over the given points[] diff --git a/cuBQL/traversal/shrinkingRadiusQuery.h b/cuBQL/traversal/shrinkingRadiusQuery.h index a60e4d6..05948d4 100644 --- a/cuBQL/traversal/shrinkingRadiusQuery.h +++ b/cuBQL/traversal/shrinkingRadiusQuery.h @@ -337,7 +337,8 @@ namespace cuBQL { } } -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + // device pass: use the toolchain's __int_as_float/__float_as_int builtins #else inline float __int_as_float(int i) { return (const float &)i; } inline int __float_as_int(float f) { return (const int &)f; } diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 851ef15..d161a16 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -1,6 +1,19 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +# When building the GPU code with HIP, the sample .cu sources must be compiled +# as HIP. Mark a target's CUDA sources accordingly. +function(cubql_sample_hip_lang target) + if (CUBQL_USE_HIP) + get_target_property(_srcs ${target} SOURCES) + foreach(_s IN LISTS _srcs) + if (_s MATCHES "\\.cu$") + set_source_files_properties(${_s} PROPERTIES LANGUAGE HIP) + endif() + endforeach() + endif() +endfunction() + # common stuff - data generators and IO for loading/generating test data add_subdirectory(common) if (NOT (TARGET stb_image)) @@ -19,10 +32,11 @@ add_subdirectory(s01_closestPoint_points_gpu) # sample that performs gpu-side closest-point queries between sets of # float3 points # ================================================================== -if (CUBQL_HAVE_CUDA) +if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP) add_executable(sample02_distanceToTriangleMesh s02_distanceToTriangleMesh/distanceToTriangleMesh.cu ) + cubql_sample_hip_lang(sample02_distanceToTriangleMesh) target_link_libraries(sample02_distanceToTriangleMesh # the cuda-side builders for float3 data cuBQL_cuda_float3 @@ -40,10 +54,11 @@ endif() # NxNxN grid. QUeries are performed by tracing axis-aligned rays and # computing cuqbl::crossingCount queries. # ================================================================== -if (CUBQL_HAVE_CUDA) +if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP) add_executable(sample03_insideOutside s03_insideOutsideOfClosedMesh/insideOutside.cu ) + cubql_sample_hip_lang(sample03_insideOutside) target_link_libraries(sample03_insideOutside # the cuda-side builders for float3 data cuBQL_cuda_float3 @@ -59,10 +74,11 @@ endif() # full inside the surface (0), fully outside (1), or whether it # intersects with one of the triangles of the surface mesh # ================================================================== -if (CUBQL_HAVE_CUDA) +if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP) add_executable(sample04_boxOverlapsOrInsideSurfaceMesh s04_boxOverlapsOrInsideSurfaceMesh/boxOverlapsOrInsideSurfaceMesh.cu ) + cubql_sample_hip_lang(sample04_boxOverlapsOrInsideSurfaceMesh) target_link_libraries(sample04_boxOverlapsOrInsideSurfaceMesh # the cuda-side builders for float3 data cuBQL_cuda_float3 @@ -79,10 +95,11 @@ endif() # whether the ray did pass through unoccluded (white) or got occluded # (black), then saves that as an image # ================================================================== -if (CUBQL_HAVE_CUDA) +if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP) add_executable(sample05_lineOfSight s05_lineOfSight/lineOfSight.cu ) + cubql_sample_hip_lang(sample05_lineOfSight) target_link_libraries(sample05_lineOfSight # the cuda-side builders for float3 data cuBQL_cuda_float3 @@ -99,10 +116,11 @@ endif() # if not. Chosen distance for this sample is 1% of scene diameter; # this uses the triangles::anyWithinRadius() query. # ================================================================== -if (CUBQL_HAVE_CUDA) +if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP) add_executable(sample06_anyTriangleWithinRadius s06_anyTriangleWithinRadius/anyTriangleWithinRadius.cu ) + cubql_sample_hip_lang(sample06_anyTriangleWithinRadius) target_link_libraries(sample06_anyTriangleWithinRadius # the cuda-side builders for float3 data cuBQL_cuda_float3 @@ -112,10 +130,11 @@ if (CUBQL_HAVE_CUDA) ) endif() -if (CUBQL_HAVE_CUDA) +if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP) add_executable(sample07_aggregateNBody s07_aggregateNBody/aggregateNBody.cu ) + cubql_sample_hip_lang(sample07_aggregateNBody) target_link_libraries(sample07_aggregateNBody # the cuda-side builders for float3 data cuBQL_cuda_float3 diff --git a/samples/s01_closestPoint_points_gpu/CMakeLists.txt b/samples/s01_closestPoint_points_gpu/CMakeLists.txt index fa956c2..1f98de1 100644 --- a/samples/s01_closestPoint_points_gpu/CMakeLists.txt +++ b/samples/s01_closestPoint_points_gpu/CMakeLists.txt @@ -1,7 +1,11 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -if (CUBQL_HAVE_CUDA) +if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP) + if (CUBQL_USE_HIP) + set_source_files_properties(closestPoint.cu closestPoint_WideBVH.cu + PROPERTIES LANGUAGE HIP) + endif() add_executable(cuBQL_sample01_points_closestPoint_cuda closestPoint.cu )