diff --git a/.github/workflows/Windows.yml b/.github/workflows/Windows.yml index 17bdf08..c03ae67 100644 --- a/.github/workflows/Windows.yml +++ b/.github/workflows/Windows.yml @@ -23,26 +23,16 @@ jobs: visual-studio: "Visual Studio 17 2022" shell: "powershell" config: Release - - os: windows-2022 - cuda: "13.0.0" - visual-studio: "Visual Studio 17 2022" - shell: "powershell" - config: Release - - os: windows-2025 - cuda: "12.6.2" - visual-studio: "Visual Studio 17 2022" - shell: "powershell" - config: Release - os: windows-2025 - cuda: "13.0.0" - visual-studio: "Visual Studio 17 2022" + cuda: "13.2.0" + visual-studio: "Visual Studio 18 2026" shell: "powershell" config: Release - os: windows-2025 - cuda: "13.0.2" - visual-studio: "Visual Studio 17 2022" + cuda: "13.2.0" + visual-studio: "Visual Studio 18 2026" shell: "powershell" - config: Release + config: Debug env: cmake_generator: "${{ matrix.visual-studio }}" diff --git a/cuBQL/CMakeLists.txt b/cuBQL/CMakeLists.txt index 3a22714..5f2c7d8 100644 --- a/cuBQL/CMakeLists.txt +++ b/cuBQL/CMakeLists.txt @@ -6,6 +6,15 @@ # in one of his/her source files add_library(cuBQL INTERFACE) +if (WIN32) + target_compile_options(cuBQL INTERFACE + $<$:-Xcompiler="/Zc:preprocessor"> + ) +# target_compile_options(${PTX_TARGET} PRIVATE +# $<$:-std=c++17> +# ) +endif() + target_sources(cuBQL INTERFACE # main public "interface" to this library bvh.h diff --git a/cuBQL/builder/cpu/spatialMedian.h b/cuBQL/builder/cpu/spatialMedian.h index b72d61e..ace5e24 100644 --- a/cuBQL/builder/cpu/spatialMedian.h +++ b/cuBQL/builder/cpu/spatialMedian.h @@ -132,6 +132,21 @@ namespace cuBQL { if (box.empty()) continue; primIDs.push_back(i); } + + if (primIDs.empty()) { + // if we had no valid input prims whatsoever + bvh.nodes = new typename BinaryBVH::Node[1]; + bvh.nodes[0].bounds = box3f(); + bvh.nodes[0].admin.offset = 0; + bvh.nodes[0].admin.count = 1; + bvh.primIDs = new uint32_t[numPrims]; + for (int i=0;i altPrimIDs(primIDs.size()); std::vector topo(1); diff --git a/cuBQL/builder/cuda.h b/cuBQL/builder/cuda.h index f15dad8..061091b 100644 --- a/cuBQL/builder/cuda.h +++ b/cuBQL/builder/cuda.h @@ -45,6 +45,7 @@ namespace cuBQL { DeviceMemoryResource() {} void malloc(void** ptr, size_t size, cudaStream_t s) override { + CUBQL_CUDA_CALL(StreamSynchronize(s)); CUBQL_CUDA_CALL(Malloc(ptr, size)); } void free(void* ptr, cudaStream_t s) override @@ -76,7 +77,10 @@ namespace cuBQL { cudaMemPool_t mempool; CUBQL_CUDA_CALL(DeviceGetDefaultMemPool(&mempool, iDevice)); uint64_t threshold = UINT64_MAX; - CUBQL_CUDA_CALL(MemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold)); + CUBQL_CUDA_CALL + (MemPoolSetAttribute(mempool, + cudaMemPoolAttrReleaseThreshold, + &threshold)); } } ); } diff --git a/cuBQL/builder/cuda/builder_common.h b/cuBQL/builder/cuda/builder_common.h index e71b901..63a9a33 100644 --- a/cuBQL/builder/cuda/builder_common.h +++ b/cuBQL/builder/cuda/builder_common.h @@ -98,6 +98,7 @@ namespace cuBQL { if (bits & sign) bits ^= 0x7fffffff; return bits; } + template<> inline __device__ int64_t encode(double f) { @@ -133,9 +134,11 @@ namespace cuBQL { if (bits & sign) bits ^= 0x7fffffff; return __int_as_float(bits); } + template<> inline __device__ int32_t decode(int32_t bits) { return bits; } + template<> inline __device__ int64_t decode(int64_t bits) { return bits; } @@ -225,23 +228,24 @@ namespace cuBQL { } template inline __device__ - void atomic_grow(AtomicBox &abox, const typename box_t::vec_t &other) + void atomic_grow(AtomicBox &abox, + typename box_t::vec_t other) { using scalar_t = typename AtomicBox::scalar_t; #pragma unroll for (int d=0;d::type enc - = //AtomicBox:: - encode(other[d]);//get(other,d)); + = encode(other[d]); if (enc < abox.lower[d]) - atomic_min(&abox.lower[d],enc); + atomic_min(abox.lower+d,enc); if (enc > abox.upper[d]) - atomic_max(&abox.upper[d],enc); + atomic_max(abox.upper+d,enc); } } template - inline __device__ void atomic_grow(AtomicBox &abox, const box_t &other) + inline __device__ + void atomic_grow(AtomicBox &abox, box_t other) { using scalar_t = typename AtomicBox::scalar_t; #pragma unroll @@ -252,13 +256,14 @@ namespace cuBQL { const typename int_type_of::type enc_upper = //AtomicBox:: encode(other.get_upper(d)); - if (enc_lower < abox.lower[d]) atomic_min(&abox.lower[d],enc_lower); - if (enc_upper > abox.upper[d]) atomic_max(&abox.upper[d],enc_upper); + if (enc_lower < abox.lower[d]) atomic_min(abox.lower+d,enc_lower); + if (enc_upper > abox.upper[d]) atomic_max(abox.upper+d,enc_upper); } } template - inline __device__ void atomic_grow(AtomicBox &abox, const AtomicBox &other) + inline __device__ void atomic_grow(AtomicBox &abox, + const AtomicBox &other) { using scalar_t = typename AtomicBox::scalar_t; #pragma unroll diff --git a/cuBQL/builder/cuda/sm_builder.h b/cuBQL/builder/cuda/sm_builder.h index 59395cd..cf68ada 100644 --- a/cuBQL/builder/cuda/sm_builder.h +++ b/cuBQL/builder/cuda/sm_builder.h @@ -25,13 +25,14 @@ namespace cuBQL { }; template - struct CUBQL_ALIGN(16) TempNode { + struct CUBQL_ALIGN(16) + TempNode { using box_t = cuBQL::box_t; union { struct { - AtomicBox centBounds; uint32_t count; uint32_t unused; + AtomicBox centBounds; } openBranch; struct { uint32_t offset; @@ -42,7 +43,6 @@ namespace cuBQL { struct { uint32_t offset; uint32_t count; - uint32_t unused[2]; } doneNode; }; }; @@ -82,7 +82,8 @@ namespace cuBQL { me.done = false; // this could be made faster by block-reducing ... atomicAdd(&nodes[0].openBranch.count,1); - atomic_grow(nodes[0].openBranch.centBounds,box.center());//centerOf(box)); + auto ctr = box.center(); + atomic_grow(nodes[0].openBranch.centBounds,ctr); } else { me.nodeID = (uint32_t)-1; me.done = true; @@ -208,13 +209,14 @@ namespace cuBQL { auto in = nodes[nodeID].openBranch; if (in.count <= buildConfig.makeLeafThreshold) { - auto &done = nodes[nodeID].doneNode; + auto done = nodes[nodeID].doneNode; done.count = in.count; // set this to max-value, so the prims can later do atomicMin // with their position ion the leaf list; this value is // greater than any prim position. done.offset = (uint32_t)-1; nodeState = DONE_NODE; + nodes[nodeID].doneNode = done; } else { float widestWidth = 0.f; int widestDim = -1; @@ -235,7 +237,8 @@ namespace cuBQL { widestCtr = ctr; } - auto &open = nodes[nodeID].openNode; + // auto &open = nodes[nodeID].openNode; + auto open = nodes[nodeID].openNode; if (widestDim >= 0) { open.pos = widestCtr; } @@ -249,12 +252,15 @@ namespace cuBQL { #pragma unroll for (int side=0;side<2;side++) { const int childID = open.offset+side; - auto &child = nodes[childID].openBranch; - child.centBounds.set_empty(); - child.count = 0; + TempNode child; + // auto &child = nodes[childID].openBranch; + child.openBranch.centBounds.set_empty(); + child.openBranch.count = 0; nodeStates[childID] = OPEN_BRANCH; + nodes[childID] = child; } nodeState = OPEN_NODE; + nodes[nodeID].openNode = open; } #endif } @@ -420,7 +426,6 @@ namespace cuBQL { GpuMemoryResource &memResource) { assert(sizeof(PrimState) == sizeof(uint64_t)); - // ================================================================== // do build on temp nodes // ================================================================== @@ -432,10 +437,12 @@ namespace cuBQL { _ALLOC(nodeStates,2*numPrims,s,memResource); _ALLOC(primStates,numPrims,s,memResource); _ALLOC(buildState,1,s,memResource); - initState<<<1,1,0,s>>>(buildState, - nodeStates, - tempNodes); - initPrims<<>> + + initState<<<1,128,0,s>>>(buildState, + nodeStates, + tempNodes); + + initPrims<<>> (tempNodes, primStates,boxes,numPrims); @@ -445,61 +452,30 @@ namespace cuBQL { // ------------------------------------------------------------------ cudaEvent_t stateDownloadedEvent; CUBQL_CUDA_CALL(EventCreate(&stateDownloadedEvent)); - - -#if CUBQL_PROFILE - int pass = 0; - static Profile t_writeNodes; - static Profile t_writePrims; - static Profile t_sortPrims; - static Profile t_nodePass[100]; - static Profile t_primPass[100]; - if (t_writeNodes.name == "") { - t_writeNodes.setName("writeNodes"); - t_writePrims.setName("writePrims"); - t_sortPrims.setName("sortPrims"); - for (int i=0;i<100;i++) { - t_nodePass[i].setName("nodePass",i); - t_primPass[i].setName("primPass",i); - } - } -#endif while (true) { CUBQL_CUDA_CALL(MemcpyAsync(&numNodes,&buildState->numNodes, sizeof(numNodes),cudaMemcpyDeviceToHost,s)); - if (numNodes == numDone) - break; CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s)); CUBQL_CUDA_CALL(EventSynchronize(stateDownloadedEvent)); -#if CUBQL_PROFILE - t_nodePass[pass].sync_start(); -#endif + CUBQL_CUDA_SYNC_CHECK_STREAM(s); + if (numNodes == numDone) + break; + selectSplits<<>> (buildState, nodeStates,tempNodes,numNodes, buildConfig); -#if CUBQL_PROFILE - t_nodePass[pass].sync_stop(); - t_primPass[pass].sync_start(); -#endif + numDone = numNodes; -// #if 1 - if (sizeof(T)*D <= sizeof(float3)) { + if (1 && sizeof(T)*D <= sizeof(float3)) { updatePrims_shm<<>> (nodeStates,tempNodes, primStates,boxes,numPrims,numDone); } else -// #else - updatePrims<<>> + updatePrims<<>> (nodeStates,tempNodes, primStates,boxes,numPrims); -// #endif - -#if CUBQL_PROFILE - t_primPass[pass].sync_stop(); - ++ pass; -#endif } CUBQL_CUDA_CALL(EventDestroy(stateDownloadedEvent)); // ================================================================== @@ -510,9 +486,6 @@ namespace cuBQL { uint8_t *d_temp_storage = NULL; size_t temp_storage_bytes = 0; PrimState *sortedPrimStates = 0; -#if CUBQL_PROFILE - t_sortPrims.sync_start(); -#endif _ALLOC(sortedPrimStates,numPrims,s,memResource); auto rc = cub::DeviceRadixSort::SortKeys((void*&)d_temp_storage, temp_storage_bytes, @@ -527,10 +500,6 @@ namespace cuBQL { numPrims,32,64,s); rc = rc; _FREE(d_temp_storage,s,memResource); -#if CUBQL_PROFILE - t_sortPrims.sync_stop(); - t_writePrims.sync_start(); -#endif // ================================================================== // allocate and write BVH item list, and write offsets of leaf nodes // ================================================================== @@ -539,10 +508,6 @@ namespace cuBQL { _ALLOC(bvh.primIDs,numPrims,s,memResource); writePrimsAndLeafOffsets<<>> (tempNodes,bvh.primIDs,sortedPrimStates,numPrims); -#if CUBQL_PROFILE - t_writePrims.sync_stop(); - t_writeNodes.sync_start(); -#endif // ================================================================== // allocate and write final nodes @@ -551,9 +516,6 @@ namespace cuBQL { _ALLOC(bvh.nodes,numNodes,s,memResource); writeNodes<<>> (bvh.nodes,tempNodes,numNodes); -#if CUBQL_PROFILE - t_writeNodes.sync_stop(); -#endif _FREE(sortedPrimStates,s,memResource); _FREE(tempNodes,s,memResource); _FREE(nodeStates,s,memResource); diff --git a/cuBQL/math/affine.h b/cuBQL/math/affine.h index 798d037..8c73416 100644 --- a/cuBQL/math/affine.h +++ b/cuBQL/math/affine.h @@ -181,6 +181,25 @@ namespace cuBQL { const typename AffineSpaceT::vector_t& n) { return xfmNormal(m.l,n); } + template + box_t xfmBox(const AffineSpaceT>> &xfm, + const box_t &in) + { + box_t out; + if (in.empty()) return out; + auto l = in.lower; + auto u = in.upper; + out.extend(xfmPoint(xfm,vec_t(l.x,l.y,l.z))); + out.extend(xfmPoint(xfm,vec_t(l.x,l.y,u.z))); + out.extend(xfmPoint(xfm,vec_t(l.x,u.y,l.z))); + out.extend(xfmPoint(xfm,vec_t(l.x,u.y,u.z))); + out.extend(xfmPoint(xfm,vec_t(u.x,l.y,l.z))); + out.extend(xfmPoint(xfm,vec_t(u.x,l.y,u.z))); + out.extend(xfmPoint(xfm,vec_t(u.x,u.y,l.z))); + out.extend(xfmPoint(xfm,vec_t(u.x,u.y,u.z))); + return out; + } + //////////////////////////////////////////////////////////////////////////////// /// Comparison Operators diff --git a/cuBQL/math/box.h b/cuBQL/math/box.h index 07736c4..a26f49f 100644 --- a/cuBQL/math/box.h +++ b/cuBQL/math/box.h @@ -79,7 +79,8 @@ namespace cuBQL { /*! create a box from two points. note this will NOT make sure that a &a, const vec_t_data &b) + inline __cubql_both box_t(const vec_t_data &a, + const vec_t_data &b) { lower = vec_t(a); upper = vec_t(b); @@ -88,11 +89,23 @@ namespace cuBQL { /*! returns a box that bounds both 'this' and another point 'v'; this does not get modified */ inline __cubql_both box_t including(const vec_t &v) const - { return box_t{min(lower,v),max(upper,v)}; } + { + // return box_t(min(lower,v),max(upper,v)); + box_t b; + b.lower = min(this->lower,v); + b.upper = max(this->upper,v); + return b; + } /*! returns a box that bounds both 'this' and another box 'b'; this does not get modified */ inline __cubql_both box_t including(const box_t &b) const - { return box_t{min(lower,b.lower),max(upper,b.upper)}; } + { + box_t bb; + bb.lower = min(this->lower,b.lower); + bb.upper = max(this->upper,b.upper); + return bb; + // return box_t(min(lower,b.lower),max(upper,b.upper)); + } inline __cubql_both box_t &grow(const vec_t &v) { lower = min(lower,v); upper = max(upper,v); return *this; } diff --git a/cuBQL/math/common.h b/cuBQL/math/common.h index a3063dd..00a1b68 100644 --- a/cuBQL/math/common.h +++ b/cuBQL/math/common.h @@ -336,8 +336,8 @@ namespace cuBQL { #define CUBQL_CUDA_SYNC_CHECK() \ { \ - cudaDeviceSynchronize(); \ - cudaError_t rc = cudaGetLastError(); \ + cudaError_t rc = cudaDeviceSynchronize(); \ + rc = cudaGetLastError(); \ if (rc != cudaSuccess) { \ fprintf(stderr, "error (%s: line %d): %s\n", \ __FILE__, __LINE__, cudaGetErrorString(rc)); \ diff --git a/cuBQL/math/math.h b/cuBQL/math/math.h index 6943b75..86855fb 100644 --- a/cuBQL/math/math.h +++ b/cuBQL/math/math.h @@ -4,14 +4,15 @@ #pragma once #include "cuBQL/math/common.h" -#ifdef __CUDACC__ -#include -#endif +// #ifdef __CUDACC__ +// #include +// #endif #include namespace cuBQL { -#ifdef __CUDACC__ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +// #if defined(__CUDACC__) || defined(__HIPCC__) // make sure we use the built-in cuda functoins that use floats, not // the c-stdlib ones that use doubles. using ::min; @@ -21,14 +22,15 @@ namespace cuBQL { using std::max; #endif -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +// #if defined(__CUDA_ARCH__) || defined(__HIPCC__) # define CUBQL_INF CUDART_INF_F // # define CUBQL_INF ::cuda::std::numeric_limits::infinity() #else # define CUBQL_INF std::numeric_limits::infinity() #endif -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) || defined(__HIPCC__) #else inline __cubql_both float __int_as_float(int i) { return (const float &)i; } inline __cubql_both int __float_as_int(float f) { return (const int &)f; } diff --git a/cuBQL/math/vec.h b/cuBQL/math/vec.h index 9f7a9b4..3048657 100644 --- a/cuBQL/math/vec.h +++ b/cuBQL/math/vec.h @@ -7,7 +7,7 @@ #include #include #include "constants.h" -#ifdef __CUDACC__ +#if defined(__CUDACC__) && !defined(CUDART_INF) # include #endif @@ -19,10 +19,10 @@ namespace cuBQL { -#ifndef __CUDACC__ - using std::min; - using std::max; -#endif +// #ifndef __CUDA_ARCH__ +// using std::min; +// using std::max; +// #endif #ifndef CUBQL_SUPPORT_CUDA_VECTOR_TYPES #define CUBQL_SUPPORT_CUDA_VECTOR_TYPES 0 @@ -46,7 +46,8 @@ namespace cuBQL { equivalent, let's also create a 'invalid_t' to be used by default */ -#ifndef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) +#else struct float2 { float x, y; }; struct float3 { float x, y, z; }; struct CUBQL_ALIGN(16) float4 { float x, y, z, w; }; @@ -492,18 +493,21 @@ namespace cuBQL { { return vec_t( v.x >> b, v.y >> b, v.z >> b, v.w >> b ); } inline __cubql_both double abs(double d) { -#ifdef __CUDA_ARCH__ +// #ifdef __CUDA_ARCH__ return ::abs(d); -#else - return std::abs(d); -#endif +// #else +// return std::abs(d); +// #endif } inline __cubql_both float abs(float d) { -#ifdef __CUDA_ARCH__ - return ::abs(d); +#if defined(__CUDA_ARCH__) || defined(__CUDACC__) + return abs(d); #else return std::abs(d); #endif +// #else +// return std::abs(d); +// #endif } // inline __cubql_both double abs(double d) { return absf(d); } @@ -711,7 +715,8 @@ namespace cuBQL { template inline __cubql_both - vec_t normalize(vec_t v) { return v * (T(1)/sqrt(dot(v,v))); } + vec_t normalize(vec_t v) { return v * (T(1)/::sqrt(dot(v,v))); } + // vec_t normalize(vec_t v) { return v * (T(1)/sqrt(dot(v,v))); } // ------------------------------------------------------------------ diff --git a/cuBQL/queries/triangleData/math/rayTriangleIntersections.h b/cuBQL/queries/triangleData/math/rayTriangleIntersections.h index bc20150..44a7683 100644 --- a/cuBQL/queries/triangleData/math/rayTriangleIntersections.h +++ b/cuBQL/queries/triangleData/math/rayTriangleIntersections.h @@ -40,7 +40,7 @@ namespace cuBQL { #else auto dbg = _dbg; #endif - if (dbg) /* nothing, just for 'maybe_unused' */; + if (dbg) {/* nothing, just for 'maybe_unused' */}; using vec3 = vec_t; const vec3 v0(tri.a); diff --git a/cuBQL/traversal/rayQueries.h b/cuBQL/traversal/rayQueries.h index 2baa33e..1a983f5 100644 --- a/cuBQL/traversal/rayQueries.h +++ b/cuBQL/traversal/rayQueries.h @@ -705,11 +705,11 @@ namespace cuBQL { ray_t &ray, bool _dbg) { -#ifdef NDEBUG +// #ifdef NDEBUG const bool dbg = false; -#else - bool dbg = _dbg; -#endif +// #else +// bool dbg = _dbg; +// #endif using bvh_t = BinaryBVH; using node_t = typename bvh_t::node_t; using scalar_t = typename bvh_t::scalar_t; @@ -735,7 +735,7 @@ namespace cuBQL { if (ray.direction.y == (scalar_t)0) ray.direction.y = scalar_t(1e-20); if (ray.direction.z == (scalar_t)0) ray.direction.z = scalar_t(1e-20); vec3_t rcp_dir = rcp(ray.direction); - + // ------------------------------------------------------------------ // traverse until there's nothing left to traverse: // ------------------------------------------------------------------ @@ -750,6 +750,7 @@ namespace cuBQL { while (true) { // if (dbg) printf("node %i.%i\n",(int)node.offset,(int)node.count); if (node.count != 0) { + // return;//xx // it's a boy! - seriously: this is not a inner node; so // we're either at a final leaf, or at an instance node if (blasStackBase != nullptr) @@ -760,8 +761,8 @@ namespace cuBQL { tlasSavedNodePtr = bvh.nodes; tlasSavedPrimIDs = bvh.primIDs; #ifndef NDEBUG - if (node.count != 1) - printf("TWO-LEVEL BVH MUST BE BUILT WITH 1 PRIM PER LEAF!\n"); + // if (node.count != 1) + // printf("TWO-LEVEL BVH MUST BE BUILT WITH 1 PRIM PER LEAF!\n"); #endif // if (dbg) // printf("inner-leaf primIDs %p ofs %i count %i\n", @@ -798,6 +799,7 @@ namespace cuBQL { // if (dbg) printf("new node %i.%i\n",(int)node.offset,(int)node.count); } + uint32_t n0Idx = (uint32_t)node.offset+0; uint32_t n1Idx = (uint32_t)node.offset+1; node_t n0 = bvh.nodes[n0Idx]; @@ -806,12 +808,14 @@ namespace cuBQL { bool o0 = rayIntersectsBox(node_t0,ray,rcp_dir,n0.bounds,dbg); bool o1 = rayIntersectsBox(node_t1,ray,rcp_dir,n1.bounds,dbg); - if (dbg) { - dout << " node L " << n0.bounds << "\n"; - dout << " node R " << n1.bounds << "\n"; - dout << "children L hit " << int(o0) << " dist " << node_t0 - << " R hit " << int(o1) << " dist " << node_t1 << "\n"; - } + // return; //xx + + // if (dbg) { + // dout << " node L " << n0.bounds << "\n"; + // dout << " node R " << n1.bounds << "\n"; + // dout << "children L hit " << int(o0) << " dist " << node_t0 + // << " R hit " << int(o1) << " dist " << node_t1 << "\n"; + // } if (o0) { if (o1) { if ((stackPtr-traversalStack) >= STACK_DEPTH) { @@ -838,15 +842,16 @@ namespace cuBQL { } } } - + + // return;//xx + if (node.count != 0 && blasStackBase != nullptr) { // we're at a valid leaf: call the lambda and see if that gave // us a new, closer cull radius // if (dbg) // printf("trav leaf-leaf primIDs %p offset %i count %i\n", // bvh.primIDs,(int)node.offset,(int)node.count); - ray.tMax - = processLeaf(bvh.primIDs,(int)node.offset,(int)node.count); + ray.tMax = processLeaf(bvh.primIDs,(int)node.offset,(int)node.count); } // ------------------------------------------------------------------ // pop next un-traversed node from stack, discarding any nodes