Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 28 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,27 @@ cmake_policy(SET CMP0048 NEW)
set(CMAKE_BUILD_TYPE_INIT "Release")
project(cuBQL VERSION ${CUBQL_VERSION} LANGUAGES C CXX)

# Build the GPU code with HIP for AMD GPUs instead of CUDA. When ON, the .cu
# sources are compiled as HIP (LANGUAGE HIP) and CUBQL_HAVE_HIP is set; the
# per-type instantiation targets and GPU samples are built the same way as for
# CUDA, gated on (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP).
option(CUBQL_USE_HIP "Build cuBQL GPU code with HIP for AMD GPUs" OFF)

# CUBQL_HAVE_CUDA / CUBQL_HAVE_HIP record which (if any) GPU toolchain is in
# use; the GPU instantiation targets and samples below are gated on either.
set(CUBQL_HAVE_CUDA OFF)
set(CUBQL_HAVE_HIP OFF)
if (CUBQL_OMP)
set(CUBQL_DISABLE_CUDA ON)
endif()
if (CUBQL_DISABLE_CUDA)
if (CUBQL_USE_HIP)
message("#cuBQL: building GPU code with HIP for AMD GPUs")
enable_language(HIP)
if (NOT DEFINED CMAKE_HIP_ARCHITECTURES OR CMAKE_HIP_ARCHITECTURES STREQUAL "")
set(CMAKE_HIP_ARCHITECTURES "gfx90a")
endif()
set(CUBQL_HAVE_HIP ON)
elseif (CUBQL_DISABLE_CUDA)
message("#cuBQL: CUDA _DISABLED_ by user request")
set(CUBQL_HAVE_CUDA OFF)
else()
Expand Down Expand Up @@ -95,7 +112,10 @@ if (CUBQL_IS_SUBPROJECT)
)
endif()
else()
if (CMAKE_CUDA_ARCHITECTURES)
if (CUBQL_USE_HIP)
# HIP build: the arch lives in CMAKE_HIP_ARCHITECTURES (defaulted above);
# do not derive or require a CUDA arch.
elseif (CMAKE_CUDA_ARCHITECTURES)
# set on the cmdline, for a local build
set(CUBQL_CUDA_ARCHITECTURES_INIT "${CMAKE_CUDA_ARCHITECTURES}")
elseif (${CMAKE_VERSION} VERSION_LESS 3.24)
Expand All @@ -119,10 +139,12 @@ else()
else()
set(CUBQL_CUDA_ARCHITECTURES_INIT "native")
endif()
set(CUBQL_CUDA_ARCHITECTURES "${CUBQL_CUDA_ARCHITECTURES_INIT}" CACHE STRING
"Which CUDA architecture to build for")
set(CMAKE_CUDA_ARCHITECTURES "${CUBQL_CUDA_ARCHITECTURES}")

if (NOT CUBQL_USE_HIP)
set(CUBQL_CUDA_ARCHITECTURES "${CUBQL_CUDA_ARCHITECTURES_INIT}" CACHE STRING
"Which CUDA architecture to build for")
set(CMAKE_CUDA_ARCHITECTURES "${CUBQL_CUDA_ARCHITECTURES}")
endif()

if(CMAKE_CONFIGURATION_TYPES) # multiconfig generator?
set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "" FORCE)
else()
Expand Down
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,12 @@ However, we strongly suggest to use `cmake`, include cuBQL as a cmake
`add_subdirectory(...)`, and then `target_link_libraries(...)` with
the desired cuBQL cmake target.

cuBQL also builds for AMD GPUs through ROCm/HIP. Configure with
`-DCUBQL_USE_HIP=ON` (and set `CMAKE_HIP_ARCHITECTURES` to the target
GPU, for example `gfx90a`) to compile the GPU builders and traversers
with `hipcc` instead of `nvcc`; the same headers and cmake targets are
used. Without `CUBQL_USE_HIP` the build is the usual CUDA build.

## Building in Header-only (explicit instantiation) mode:

- in your own CUDA sources (say, `userMain.cu`):
Expand Down
10 changes: 9 additions & 1 deletion cuBQL/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ function(add_specific_instantiation device suffix T D)
PUBLIC
-DCUBQL_HAVE_CUDA=1
)
if (CUBQL_USE_HIP)
set_source_files_properties(builder/${device}/instantiate_builders.${suffix}
PROPERTIES LANGUAGE HIP)
endif()
endif()
set_target_properties(cuBQL_${device}_${T}${D}
PROPERTIES
Expand All @@ -91,6 +95,10 @@ function(add_specific_instantiation device suffix T D)
PUBLIC
-DCUBQL_HAVE_CUDA=1
)
if (CUBQL_USE_HIP)
set_source_files_properties(builder/${device}/instantiate_builders.${suffix}
PROPERTIES LANGUAGE HIP)
endif()
endif()
set_target_properties(cuBQL_${device}_${T}${D}_static
PROPERTIES
Expand Down Expand Up @@ -130,7 +138,7 @@ endfunction()
foreach(T IN ITEMS float int double longlong)
foreach(D IN ITEMS 2 3 4)
add_specific_instantiation(cpu cpp ${T} ${D})
if (CUBQL_HAVE_CUDA)
if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP)
add_specific_instantiation(cuda cu ${T} ${D})
endif()
endforeach()
Expand Down
5 changes: 4 additions & 1 deletion cuBQL/builder/cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,10 @@ namespace cuBQL {
}
};

#if CUDART_VERSION >= 11020
// hipMallocAsync and the hipMemPool API are available on ROCm; on the HIP
// toolchain CUDART_VERSION is undefined (0), so select the async path
// explicitly there. The CUDA arm keeps its >= 11020 floor unchanged.
#if defined(__HIPCC__) || CUDART_VERSION >= 11020
/* Allocator that uses cudaMallocAsync to allocate memory. This can
be much faster than cudaMalloc because it doesn't require a
device sync for each malloc; but .. CAREFUL: to get memory to be
Expand Down
6 changes: 3 additions & 3 deletions cuBQL/bvh.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,15 +145,15 @@ namespace cuBQL {
using bvh3f = BinaryBVH<float,3>;
using bvh3d = BinaryBVH<double,3>;

#ifdef __CUDACC__
#if defined(__CUDACC__) || defined(__HIPCC__)
typedef BinaryBVH<float,2> bvh_float2;
typedef BinaryBVH<float,3> bvh_float3;
typedef BinaryBVH<float,4> bvh_float4;
#endif

} // ::cuBQL

#ifdef __CUDACC__
#if defined(__CUDACC__) || defined(__HIPCC__)
# include "cuBQL/builder/cuda.h"
#endif

Expand Down
42 changes: 42 additions & 0 deletions cuBQL/math/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,48 @@
# include <hip/hip_runtime.h>
# include <hip/driver_types.h>
# include <hip/hip_runtime.h>
// cuBQL spells the device runtime API as cudaXxx (call sites and the
// CUBQL_CUDA_CALL(call) -> cuda##call token-paste macro). HIP does not
// provide the cudaXxx aliases, so map the exact set cuBQL uses onto the
// hipXxx equivalents. Confined to the __HIPCC__ path so the CUDA build is
// unchanged.
using cudaError_t = hipError_t;
using cudaStream_t = hipStream_t;
using cudaEvent_t = hipEvent_t;
using cudaMemPool_t = hipMemPool_t;
# define cudaSuccess hipSuccess
# define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
# define cudaMemcpyHostToDevice hipMemcpyHostToDevice
# define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
# define cudaMemcpyDefault hipMemcpyDefault
# define cudaMemPoolAttrReleaseThreshold hipMemPoolAttrReleaseThreshold
# define cudaGetErrorString hipGetErrorString
# define cudaGetLastError hipGetLastError
# define cudaDeviceSynchronize hipDeviceSynchronize
# define cudaStreamSynchronize hipStreamSynchronize
# define cudaGetDevice hipGetDevice
# define cudaSetDevice hipSetDevice
# define cudaGetDeviceCount hipGetDeviceCount
# define cudaMalloc hipMalloc
# define cudaMallocHost hipHostMalloc
# define cudaFree hipFree
# define cudaFreeHost hipHostFree
# define cudaMallocManaged hipMallocManaged
# define cudaMallocAsync hipMallocAsync
# define cudaFreeAsync hipFreeAsync
# define cudaMemcpy hipMemcpy
# define cudaMemcpyAsync hipMemcpyAsync
# define cudaMemcpyFromSymbol hipMemcpyFromSymbol
# define cudaMemcpyToSymbol hipMemcpyToSymbol
# define cudaMemset hipMemset
# define cudaMemsetAsync hipMemsetAsync
# define cudaDeviceGetDefaultMemPool hipDeviceGetDefaultMemPool
# define cudaMemPoolSetAttribute hipMemPoolSetAttribute
# define cudaEventCreate hipEventCreate
# define cudaEventDestroy hipEventDestroy
# define cudaEventRecord hipEventRecord
# define cudaEventSynchronize hipEventSynchronize
# define cudaEventElapsedTime hipEventElapsedTime
#elif defined(__CUDACC__)
# include <cuda_runtime.h>
// on cuda 13.2/gcc 15.2/ubuntu 25.10 we need to include that before cuda.h gets included
Expand Down
17 changes: 17 additions & 0 deletions cuBQL/math/constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,23 @@
#if defined(__CUDACC__) && !defined(CUDART_INF_F)
#include <math_constants.h>
#endif
#if defined(__HIPCC__)
// CUDA's <math_constants.h> (the CUDART_* device float constants) has no HIP
// analogue; provide the ones cuBQL uses so the shared device code compiles
// unchanged on the HIP path.
# ifndef CUDART_INF_F
# define CUDART_INF_F __builtin_huge_valf()
# endif
# ifndef CUDART_INF
# define CUDART_INF __builtin_huge_val()
# endif
# ifndef CUDART_NAN_F
# define CUDART_NAN_F __builtin_nanf("")
# endif
# ifndef CUDART_NAN
# define CUDART_NAN __builtin_nan("")
# endif
#endif

#ifndef M_PI
#define M_PI 3.141593f
Expand Down
8 changes: 4 additions & 4 deletions cuBQL/math/vec.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,10 @@ namespace cuBQL {
using cuda_t = typename cuda_eq_t<T,2>::type;
inline __cubql_both operator cuda_t() { cuda_t t; t.x = x; t.y = y; return t; }
#endif
#ifdef __CUDACC__
#if defined(__CUDACC__) || defined(__HIPCC__)
/*! allow to typecast that to a dim3, so it can be used as a cuda kernel launch dim */
inline __cubql_both operator dim3() { dim3 t; t.x = x; t.y = y; t.z = 1; return t; }
#endif
#endif
T x, y;
};
template<typename T>
Expand All @@ -96,10 +96,10 @@ namespace cuBQL {
inline __cubql_both T operator[](int i) const { return (i==2)?z:(i?y:x); }
inline __cubql_both T &operator[](int i) { return (i==2)?z:(i?y:x); }
/*! auto-cast to equivalent cuda type */
#ifdef __CUDACC__
#if defined(__CUDACC__) || defined(__HIPCC__)
/*! allow to typecast that to a dim3, so it can be used as a cuda kernel launch dim */
inline __cubql_both operator dim3() { dim3 t; t.x = x; t.y = y; t.z = z; return t; }
#endif
#endif
#if CUBQL_SUPPORT_CUDA_VECTOR_TYPES
using cuda_t = typename cuda_eq_t<T,3>::type;
inline __cubql_both operator cuda_t() { cuda_t t; t.x = x; t.y = y; t.z = z; return t; }
Expand Down
17 changes: 14 additions & 3 deletions cuBQL/queries/pointData/findClosest.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,17 @@
#include "cuBQL/bvh.h"
#include "cuBQL/traversal/shrinkingRadiusQuery.h"

// The findClosest* queries are forward-declared __host__ __device__ (so the
// CPU sample can call them) but defined __device__-only. nvcc tolerates that
// decl/def host/device-attribute mismatch; HIP/clang rejects it. On the HIP
// device path align the forward declarations to the device-only definitions.
// Outside HIP (CUDA and the plain-C++ CPU build) the spelling is unchanged.
#if defined(__HIPCC__)
# define __cubql_findClosest_decl __cubql_device
#else
# define __cubql_findClosest_decl __cubql_both
#endif

namespace cuBQL {
namespace points {

Expand All @@ -34,7 +45,7 @@ namespace cuBQL {
not)
*/
template<typename T, int D>
inline __cubql_both
inline __cubql_findClosest_decl
int findClosest(/*! binary bvh built over the given points[]
specfied below */
BinaryBVH<T,D> bvhOverPoints,
Expand Down Expand Up @@ -65,7 +76,7 @@ namespace cuBQL {
not)
*/
template<typename T, int D, int W>
inline __cubql_both
inline __cubql_findClosest_decl
int findClosest(/*! binary bvh built over the given points[]
specfied below */
WideBVH<T,D,W> bvhOverPoints,
Expand All @@ -81,7 +92,7 @@ namespace cuBQL {


template<typename T, int D>
inline __cubql_both
inline __cubql_findClosest_decl
int findClosest_exludeID(/*! primitive ID to _exclude_ from queries */
int idOfPointtoExclude,
/*! binary bvh built over the given points[]
Expand Down
3 changes: 2 additions & 1 deletion cuBQL/traversal/shrinkingRadiusQuery.h
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,8 @@ namespace cuBQL {
}
}

#ifdef __CUDA_ARCH__
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
// device pass: use the toolchain's __int_as_float/__float_as_int builtins
#else
inline float __int_as_float(int i) { return (const float &)i; }
inline int __float_as_int(float f) { return (const int &)f; }
Expand Down
31 changes: 25 additions & 6 deletions samples/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,19 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# When building the GPU code with HIP, the sample .cu sources must be compiled
# as HIP. Mark a target's CUDA sources accordingly.
function(cubql_sample_hip_lang target)
if (CUBQL_USE_HIP)
get_target_property(_srcs ${target} SOURCES)
foreach(_s IN LISTS _srcs)
if (_s MATCHES "\\.cu$")
set_source_files_properties(${_s} PROPERTIES LANGUAGE HIP)
endif()
endforeach()
endif()
endfunction()

# common stuff - data generators and IO for loading/generating test data
add_subdirectory(common)
if (NOT (TARGET stb_image))
Expand All @@ -19,10 +32,11 @@ add_subdirectory(s01_closestPoint_points_gpu)
# sample that performs gpu-side closest-point queries between sets of
# float3 points
# ==================================================================
if (CUBQL_HAVE_CUDA)
if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP)
add_executable(sample02_distanceToTriangleMesh
s02_distanceToTriangleMesh/distanceToTriangleMesh.cu
)
cubql_sample_hip_lang(sample02_distanceToTriangleMesh)
target_link_libraries(sample02_distanceToTriangleMesh
# the cuda-side builders for float3 data
cuBQL_cuda_float3
Expand All @@ -40,10 +54,11 @@ endif()
# NxNxN grid. QUeries are performed by tracing axis-aligned rays and
# computing cuqbl::crossingCount queries.
# ==================================================================
if (CUBQL_HAVE_CUDA)
if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP)
add_executable(sample03_insideOutside
s03_insideOutsideOfClosedMesh/insideOutside.cu
)
cubql_sample_hip_lang(sample03_insideOutside)
target_link_libraries(sample03_insideOutside
# the cuda-side builders for float3 data
cuBQL_cuda_float3
Expand All @@ -59,10 +74,11 @@ endif()
# full inside the surface (0), fully outside (1), or whether it
# intersects with one of the triangles of the surface mesh
# ==================================================================
if (CUBQL_HAVE_CUDA)
if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP)
add_executable(sample04_boxOverlapsOrInsideSurfaceMesh
s04_boxOverlapsOrInsideSurfaceMesh/boxOverlapsOrInsideSurfaceMesh.cu
)
cubql_sample_hip_lang(sample04_boxOverlapsOrInsideSurfaceMesh)
target_link_libraries(sample04_boxOverlapsOrInsideSurfaceMesh
# the cuda-side builders for float3 data
cuBQL_cuda_float3
Expand All @@ -79,10 +95,11 @@ endif()
# whether the ray did pass through unoccluded (white) or got occluded
# (black), then saves that as an image
# ==================================================================
if (CUBQL_HAVE_CUDA)
if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP)
add_executable(sample05_lineOfSight
s05_lineOfSight/lineOfSight.cu
)
cubql_sample_hip_lang(sample05_lineOfSight)
target_link_libraries(sample05_lineOfSight
# the cuda-side builders for float3 data
cuBQL_cuda_float3
Expand All @@ -99,10 +116,11 @@ endif()
# if not. Chosen distance for this sample is 1% of scene diameter;
# this uses the triangles::anyWithinRadius() query.
# ==================================================================
if (CUBQL_HAVE_CUDA)
if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP)
add_executable(sample06_anyTriangleWithinRadius
s06_anyTriangleWithinRadius/anyTriangleWithinRadius.cu
)
cubql_sample_hip_lang(sample06_anyTriangleWithinRadius)
target_link_libraries(sample06_anyTriangleWithinRadius
# the cuda-side builders for float3 data
cuBQL_cuda_float3
Expand All @@ -112,10 +130,11 @@ if (CUBQL_HAVE_CUDA)
)
endif()

if (CUBQL_HAVE_CUDA)
if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP)
add_executable(sample07_aggregateNBody
s07_aggregateNBody/aggregateNBody.cu
)
cubql_sample_hip_lang(sample07_aggregateNBody)
target_link_libraries(sample07_aggregateNBody
# the cuda-side builders for float3 data
cuBQL_cuda_float3
Expand Down
6 changes: 5 additions & 1 deletion samples/s01_closestPoint_points_gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

if (CUBQL_HAVE_CUDA)
if (CUBQL_HAVE_CUDA OR CUBQL_HAVE_HIP)
if (CUBQL_USE_HIP)
set_source_files_properties(closestPoint.cu closestPoint_WideBVH.cu
PROPERTIES LANGUAGE HIP)
endif()
add_executable(cuBQL_sample01_points_closestPoint_cuda
closestPoint.cu
)
Expand Down