From 3bfb5658df07cb41fea78e0b42d2d59b5612529b Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Fri, 3 Apr 2026 18:27:42 -0600 Subject: [PATCH 1/9] added xfmBox to match xfmVector and xfmPoint etc --- cuBQL/math/affine.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/cuBQL/math/affine.h b/cuBQL/math/affine.h index 798d037..d4e0d4c 100644 --- a/cuBQL/math/affine.h +++ b/cuBQL/math/affine.h @@ -181,6 +181,25 @@ namespace cuBQL { const typename AffineSpaceT::vector_t& n) { return xfmNormal(m.l,n); } + template + box_t xfmBox(const AffineSpaceT>> &xfm, + const box_t &box) + { + box_t out; + if (in.empty()) return out; + auto l = in.lower; + auto u = in.upper; + out.extend(xfmPoint(xfm,vec_t(l.x,l.y,l.z))); + out.extend(xfmPoint(xfm,vec_t(l.x,l.y,u.z))); + out.extend(xfmPoint(xfm,vec_t(l.x,u.y,l.z))); + out.extend(xfmPoint(xfm,vec_t(l.x,u.y,u.z))); + out.extend(xfmPoint(xfm,vec_t(u.x,l.y,l.z))); + out.extend(xfmPoint(xfm,vec_t(u.x,l.y,u.z))); + out.extend(xfmPoint(xfm,vec_t(u.x,u.y,l.z))); + out.extend(xfmPoint(xfm,vec_t(u.x,u.y,u.z))); + return out; + } + //////////////////////////////////////////////////////////////////////////////// /// Comparison Operators From 0a9a1af55b91ac9660a4677199de4d73d91e894d Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Fri, 3 Apr 2026 18:31:01 -0600 Subject: [PATCH 2/9] typo fix --- cuBQL/math/affine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuBQL/math/affine.h b/cuBQL/math/affine.h index d4e0d4c..8c73416 100644 --- a/cuBQL/math/affine.h +++ b/cuBQL/math/affine.h @@ -183,7 +183,7 @@ namespace cuBQL { template box_t xfmBox(const AffineSpaceT>> &xfm, - const box_t &box) + const box_t &in) { box_t out; if (in.empty()) return out; From 33b17349318f45dec120f17695921bb231fbdad9 Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Fri, 12 Jun 2026 18:04:44 -0600 Subject: [PATCH 3/9] working on hip --- cuBQL/builder/cpu.h | 1 + cuBQL/builder/cpu/spatialMedian.h | 41 ++ cuBQL/builder/cuda.h | 2 + cuBQL/builder/cuda/builder_common.h | 145 ++++--- cuBQL/builder/cuda/gpu_builder.h | 2 + cuBQL/builder/cuda/sah_builder.h | 16 +- cuBQL/builder/cuda/sm_builder.h | 400 +++++++++++++++--- cuBQL/math/box.h | 19 +- cuBQL/math/common.h | 4 +- cuBQL/math/constants.h | 2 +- cuBQL/math/math.h | 14 +- cuBQL/math/vec.h | 31 +- .../math/rayTriangleIntersections.h | 2 +- cuBQL/traversal/rayQueries.h | 37 +- 14 files changed, 545 insertions(+), 171 deletions(-) diff --git a/cuBQL/builder/cpu.h b/cuBQL/builder/cpu.h index bdf447d..9c78acd 100644 --- a/cuBQL/builder/cpu.h +++ b/cuBQL/builder/cpu.h @@ -66,6 +66,7 @@ namespace cuBQL { uint32_t numPrims, BuildConfig buildConfig) { + PING; /*! right now, only have a slow spatial median builder */ cpu::spatialMedian(bvh,boxes,numPrims,buildConfig); } diff --git a/cuBQL/builder/cpu/spatialMedian.h b/cuBQL/builder/cpu/spatialMedian.h index b72d61e..6f128b8 100644 --- a/cuBQL/builder/cpu/spatialMedian.h +++ b/cuBQL/builder/cpu/spatialMedian.h @@ -112,11 +112,31 @@ namespace cuBQL { node.bounds = box_t() .including(bvh.nodes[node.admin.offset+0].bounds) .including(bvh.nodes[node.admin.offset+1].bounds); + if (node.bounds.empty()) { + PING; + PRINT(bvh.numNodes); + PRINT(node.admin.offset); + PRINT(node.admin.count); + PRINT(bvh.nodes[node.admin.offset+0].bounds); + PRINT(bvh.nodes[node.admin.offset+1].bounds); + + box3f a = box_t() + .including(bvh.nodes[node.admin.offset+0].bounds); + box3f b = box_t() + .including(bvh.nodes[node.admin.offset+0].bounds) + .including(bvh.nodes[node.admin.offset+1].bounds); + box3f c = box_t(); + PRINT(a); + PRINT(b); + PRINT(c); + } } else { node.bounds.clear(); for (int i=0;i @@ -127,11 +147,30 @@ namespace cuBQL { { using box_t = ::cuBQL::box_t; std::vector primIDs; + PING; PRINT(numPrims); for (int i=0;i::Node[1]; + bvh.nodes[0].bounds = box3f(); + bvh.nodes[0].admin.offset = 0; + bvh.nodes[0].admin.count = 1; + bvh.primIDs = new uint32_t[numPrims]; + for (int i=0;i altPrimIDs(primIDs.size()); std::vector topo(1); @@ -151,6 +190,8 @@ namespace cuBQL { } topo.clear(); refit(0,bvh,boxes); + PING; + PRINT(bvh.nodes[0].bounds); } } // spatialMedian_impl diff --git a/cuBQL/builder/cuda.h b/cuBQL/builder/cuda.h index f15dad8..eca25ef 100644 --- a/cuBQL/builder/cuda.h +++ b/cuBQL/builder/cuda.h @@ -45,6 +45,7 @@ namespace cuBQL { DeviceMemoryResource() {} void malloc(void** ptr, size_t size, cudaStream_t s) override { + CUBQL_CUDA_CALL(StreamSynchronize(s)); CUBQL_CUDA_CALL(Malloc(ptr, size)); } void free(void* ptr, cudaStream_t s) override @@ -81,6 +82,7 @@ namespace cuBQL { } ); } void malloc(void** ptr, size_t size, cudaStream_t s) override { + PING; #ifndef NDEBUG if (s_numDevices > 1 && s == 0) std::cerr << "@cuBQL: warning; async memory allocator used with default stream." diff --git a/cuBQL/builder/cuda/builder_common.h b/cuBQL/builder/cuda/builder_common.h index e71b901..49eac52 100644 --- a/cuBQL/builder/cuda/builder_common.h +++ b/cuBQL/builder/cuda/builder_common.h @@ -23,7 +23,7 @@ namespace cub { namespace cuBQL { namespace gpuBuilder_impl { - inline __device__ void atomic_min(int32_t *v, int32_t vv) + inline __device__ void atomic_min(volatile int32_t *v, int32_t vv) { atomicMin((int *)v,(int)vv); } inline __device__ void atomic_min(int64_t *v, int64_t vv) { atomicMin((long long *)v,(long long)vv); } @@ -32,7 +32,7 @@ namespace cuBQL { inline __device__ void atomic_min(uint64_t *v, uint64_t vv) { atomicMin((unsigned long long *)v,(unsigned long long)vv); } - inline __device__ void atomic_max(int32_t *v, int32_t vv) + inline __device__ void atomic_max(volatile int32_t *v, int32_t vv) { atomicMax((int *)v,(int)vv); } inline __device__ void atomic_max(int64_t *v, int64_t vv) { atomicMax((long long *)v,(long long)vv); } @@ -81,72 +81,86 @@ namespace cuBQL { template struct int_type_of; template<> struct int_type_of { typedef int32_t type; }; - template<> struct int_type_of { typedef int64_t type; }; - template<> struct int_type_of { typedef int32_t type; }; - template<> struct int_type_of { typedef int64_t type; }; - template<> struct int_type_of { typedef uint32_t type; }; - template<> struct int_type_of { typedef uint64_t type; }; + // template<> struct int_type_of { typedef int64_t type; }; + // template<> struct int_type_of { typedef int32_t type; }; + // template<> struct int_type_of { typedef int64_t type; }; + // template<> struct int_type_of { typedef uint32_t type; }; + // template<> struct int_type_of { typedef uint64_t type; }; - template inline __device__ - typename int_type_of::type encode(T v); + // template inline __device__ + // typename int_type_of::type encode(T v); - template<> inline __device__ - int32_t encode(float f) + // template<> + inline __device__ + int32_t encode(const float &f) { - const int32_t sign = 0x80000000; - int32_t bits = __float_as_int(f); + // int *ptr = (int *)&f; + + const int32_t sign = 0x80000000U; + // int32_t bits = __float_as_int(f); + int32_t bits;// = *ptr;//(const int &)f; + memcpy(&bits,&f,4); if (bits & sign) bits ^= 0x7fffffff; return bits; } - template<> inline __device__ - int64_t encode(double f) - { - const int64_t sign = 0x8000000000000000LL; - int64_t bits = __double_as_longlong(f); - if (bits & sign) bits ^= 0x7fffffffffffffffLL; - return bits; - } - template<> inline __device__ - int32_t encode(int32_t bits) - { - return bits; - } - template<> inline __device__ - int64_t encode(int64_t bits) - { - return bits; - } + // template<> inline __device__ + // int64_t encode(double f) + // { + // const int64_t sign = 0x8000000000000000LL; + // int64_t bits = __double_as_longlong(f); + // if (bits & sign) bits ^= 0x7fffffffffffffffLL; + // return bits; + // } + // template<> inline __device__ + // int32_t encode(int32_t bits) + // { + // return bits; + // } + // template<> inline __device__ + // int64_t encode(int64_t bits) + // { + // return bits; + // } template inline __device__ T decode(int32_t v); - template inline __device__ - T decode(uint32_t v); - template inline __device__ - T decode(int64_t v); - template inline __device__ - T decode(uint64_t v); + // template inline __device__ + // T decode(uint32_t v); + // template inline __device__ + // T decode(int64_t v); + // template inline __device__ + // T decode(uint64_t v); template<> inline __device__ float decode(int32_t bits) { const int32_t sign = 0x80000000; if (bits & sign) bits ^= 0x7fffffff; - return __int_as_float(bits); +// #ifdef __HIPCC__ +// float ff = ((const float &)bits); +// return ff; +// #endif + float f; + memcpy(&f,&bits,sizeof(4)); + return f; + // float *f = (float *)&bits; + // return *f; + // return __int_as_float(bits); } - template<> inline __device__ - int32_t decode(int32_t bits) - { return bits; } - template<> inline __device__ - int64_t decode(int64_t bits) - { return bits; } + // template<> inline __device__ + // int32_t decode(int32_t bits) + // { return bits; } + // template<> inline __device__ + // int64_t decode(int64_t bits) + // { return bits; } - template<> inline __device__ - double decode(int64_t bits) - { - const int64_t sign = 0x8000000000000000LL; - if (bits & sign) bits ^= 0x7fffffffffffffffLL; - return __longlong_as_double(bits); - } + // template<> inline __device__ + // double decode(int64_t bits) + // { + // const int64_t sign = 0x8000000000000000LL; + // if (bits & sign) bits ^= 0x7fffffffffffffffLL; + // return __longlong_as_double(bits); + // } template struct CUBQL_ALIGN(8) AtomicBox { @@ -219,29 +233,37 @@ namespace cuBQL { using scalar_t = typename box_t::scalar_t; #pragma unroll for (int d=0;d());//encode(+FLT_MAX); upper[d] = encode(empty_box_upper());//encode(-FLT_MAX); +#endif } } template inline __device__ - void atomic_grow(AtomicBox &abox, const typename box_t::vec_t &other) + void atomic_grow(AtomicBox &abox, + typename box_t::vec_t other) { using scalar_t = typename AtomicBox::scalar_t; + auto in_y = abox.upper[1]; #pragma unroll for (int d=0;d::type enc - = //AtomicBox:: - encode(other[d]);//get(other,d)); + // const typename int_type_of::type enc + const int32_t enc + = encode(other[d]); if (enc < abox.lower[d]) - atomic_min(&abox.lower[d],enc); + atomic_min((volatile int32_t*)abox.lower+d,enc); if (enc > abox.upper[d]) - atomic_max(&abox.upper[d],enc); + atomic_max((volatile int32_t*)abox.upper+d,enc); } } template - inline __device__ void atomic_grow(AtomicBox &abox, const box_t &other) + inline __device__ + void atomic_grow(AtomicBox &abox, box_t other) { using scalar_t = typename AtomicBox::scalar_t; #pragma unroll @@ -252,13 +274,14 @@ namespace cuBQL { const typename int_type_of::type enc_upper = //AtomicBox:: encode(other.get_upper(d)); - if (enc_lower < abox.lower[d]) atomic_min(&abox.lower[d],enc_lower); - if (enc_upper > abox.upper[d]) atomic_max(&abox.upper[d],enc_upper); + if (enc_lower < abox.lower[d]) atomic_min(abox.lower+d,enc_lower); + if (enc_upper > abox.upper[d]) atomic_max(abox.upper+d,enc_upper); } } template - inline __device__ void atomic_grow(AtomicBox &abox, const AtomicBox &other) + inline __device__ void atomic_grow(AtomicBox &abox, + const AtomicBox &other) { using scalar_t = typename AtomicBox::scalar_t; #pragma unroll diff --git a/cuBQL/builder/cuda/gpu_builder.h b/cuBQL/builder/cuda/gpu_builder.h index 18365f7..9b396dc 100644 --- a/cuBQL/builder/cuda/gpu_builder.h +++ b/cuBQL/builder/cuda/gpu_builder.h @@ -52,7 +52,9 @@ namespace cuBQL { buildConfig.makeLeafThreshold = 1; gpuBuilder_impl::build(bvh,boxes,numBoxes,buildConfig,s,memResource); } + PING; cuBQL::cuda::refit(bvh,boxes,s,memResource); + PING; } namespace cuda { diff --git a/cuBQL/builder/cuda/sah_builder.h b/cuBQL/builder/cuda/sah_builder.h index 423d6fb..4ba5277 100644 --- a/cuBQL/builder/cuda/sah_builder.h +++ b/cuBQL/builder/cuda/sah_builder.h @@ -554,14 +554,14 @@ namespace cuBQL { GpuMemoryResource& memResource) { real3_sahBuilder(bvh,boxes,_numPrims,buildConfig,s,memResource); } - template<> - inline void sahBuilder(BinaryBVH &bvh, - const box_t *boxes, - uint32_t _numPrims, - BuildConfig buildConfig, - cudaStream_t s, - GpuMemoryResource& memResource) - { real3_sahBuilder(bvh,boxes,_numPrims,buildConfig,s,memResource); } + // template<> + // inline void sahBuilder(BinaryBVH &bvh, + // const box_t *boxes, + // uint32_t _numPrims, + // BuildConfig buildConfig, + // cudaStream_t s, + // GpuMemoryResource& memResource) + // { real3_sahBuilder(bvh,boxes,_numPrims,buildConfig,s,memResource); } } // ::cuBQL::sahBuilder_impl namespace cuda { diff --git a/cuBQL/builder/cuda/sm_builder.h b/cuBQL/builder/cuda/sm_builder.h index 59395cd..c3cc42c 100644 --- a/cuBQL/builder/cuda/sm_builder.h +++ b/cuBQL/builder/cuda/sm_builder.h @@ -25,13 +25,14 @@ namespace cuBQL { }; template - struct CUBQL_ALIGN(16) TempNode { + struct// CUBQL_ALIGN(16) + TempNode { using box_t = cuBQL::box_t; - union { + // union { struct { - AtomicBox centBounds; uint32_t count; uint32_t unused; + AtomicBox centBounds; } openBranch; struct { uint32_t offset; @@ -42,9 +43,8 @@ namespace cuBQL { struct { uint32_t offset; uint32_t count; - uint32_t unused[2]; } doneNode; - }; + // }; }; template @@ -53,11 +53,15 @@ namespace cuBQL { NodeState *nodeStates, TempNode *nodes) { + if (threadIdx.x > 0) return; + buildState->numNodes = 2; nodeStates[0] = OPEN_BRANCH; nodes[0].openBranch.count = 0; nodes[0].openBranch.centBounds.set_empty(); + if (nodes[0].openBranch.centBounds.upper[0] > 0) + nodes[0].openBranch.centBounds.upper[0] = 8888; nodeStates[1] = DONE_NODE; nodes[1].doneNode.offset = 0; @@ -82,7 +86,9 @@ namespace cuBQL { me.done = false; // this could be made faster by block-reducing ... atomicAdd(&nodes[0].openBranch.count,1); - atomic_grow(nodes[0].openBranch.centBounds,box.center());//centerOf(box)); + auto ctr = box.center(); + atomic_grow(nodes[0].openBranch.centBounds,ctr);//centerOf(box)); + // atomic_grow(nodes[0].openBranch.centBounds,centerOf(box)); } else { me.nodeID = (uint32_t)-1; me.done = true; @@ -97,7 +103,7 @@ namespace cuBQL { uint32_t numNodes, BuildConfig buildConfig) { -#if 1 +#if 0 __shared__ int l_newNodeOfs; if (threadIdx.x == 0) l_newNodeOfs = 0; @@ -208,13 +214,14 @@ namespace cuBQL { auto in = nodes[nodeID].openBranch; if (in.count <= buildConfig.makeLeafThreshold) { - auto &done = nodes[nodeID].doneNode; + auto done = nodes[nodeID].doneNode; done.count = in.count; // set this to max-value, so the prims can later do atomicMin // with their position ion the leaf list; this value is // greater than any prim position. done.offset = (uint32_t)-1; nodeState = DONE_NODE; + nodes[nodeID].doneNode = done; } else { float widestWidth = 0.f; int widestDim = -1; @@ -235,7 +242,8 @@ namespace cuBQL { widestCtr = ctr; } - auto &open = nodes[nodeID].openNode; + // auto &open = nodes[nodeID].openNode; + auto open = nodes[nodeID].openNode; if (widestDim >= 0) { open.pos = widestCtr; } @@ -249,12 +257,15 @@ namespace cuBQL { #pragma unroll for (int side=0;side<2;side++) { const int childID = open.offset+side; - auto &child = nodes[childID].openBranch; - child.centBounds.set_empty(); - child.count = 0; + TempNode child; + // auto &child = nodes[childID].openBranch; + child.openBranch.centBounds.set_empty(); + child.openBranch.count = 0; nodeStates[childID] = OPEN_BRANCH; + nodes[childID] = child; } nodeState = OPEN_NODE; + nodes[nodeID].openNode = open; } #endif } @@ -420,6 +431,8 @@ namespace cuBQL { GpuMemoryResource &memResource) { assert(sizeof(PrimState) == sizeof(uint64_t)); + CUBQL_CUDA_SYNC_CHECK_STREAM(s); + CUBQL_CUDA_SYNC_CHECK(); // ================================================================== // do build on temp nodes @@ -432,13 +445,168 @@ namespace cuBQL { _ALLOC(nodeStates,2*numPrims,s,memResource); _ALLOC(primStates,numPrims,s,memResource); _ALLOC(buildState,1,s,memResource); - initState<<<1,1,0,s>>>(buildState, - nodeStates, - tempNodes); - initPrims<<>> + +#if 1 + CUBQL_CUDA_SYNC_CHECK_STREAM(s); + CUBQL_CUDA_SYNC_CHECK(); + { + int numNodes = 1; + std::cout << "*********** INPUT BOUNDS ***********" << std::endl; + std::vector h_bbs(numPrims); + CUBQL_CUDA_CALL(Memcpy(h_bbs.data(),boxes, + numPrims*sizeof(*boxes),cudaMemcpyDefault)); + for (int i=0;i 1000000) + PRINT(bb); + } + } +#endif + + CUBQL_CUDA_SYNC_CHECK_STREAM(s); + CUBQL_CUDA_SYNC_CHECK(); + initState<<<1,128,0,s>>>(buildState, + nodeStates, + tempNodes); + + + CUBQL_CUDA_SYNC_CHECK_STREAM(s); + CUBQL_CUDA_SYNC_CHECK(); +#if 0 + CUBQL_CUDA_SYNC_CHECK_STREAM(s); + CUBQL_CUDA_SYNC_CHECK(); + { + int numNodes = 1; + std::cout << "*********** AFTER INITSTATE ***********" << std::endl; + std::vector h_nodeStates(numNodes); + std::vector> h_tempNodes(numNodes); + CUBQL_CUDA_CALL(Memcpy(h_nodeStates.data(),nodeStates, + numNodes*sizeof(*nodeStates),cudaMemcpyDefault)); +CUBQL_CUDA_CALL(Memcpy(h_tempNodes.data(),tempNodes, + numNodes*sizeof(*tempNodes),cudaMemcpyDefault)); + for (int i=0;i float { + const int32_t sign = 0x80000000; + if (bits & sign) bits ^= 0x7fffffff; + float f; + memcpy(&f,&bits,4); + return f; + }; + auto _decode2 = [](int32_t bits) -> float { + const int32_t sign = 0x80000000; + if (bits & sign) bits ^= 0x7fffffff; + return (const float &)bits; + }; + auto _encode = [](float f) -> int32_t { + int *ptr = (int *)&f; + const int32_t sign = 0x80000000U; + // int32_t bits = __float_as_int(f); + int32_t bits = *ptr;//(const int &)f; + if (bits & sign) bits ^= 0x7fffffff; + return bits; + }; + auto &bb = n.openBranch.centBounds; + float tf = +1e8f; + auto etf = _encode(tf); + auto detf = _decode(etf); + auto detf2 = _decode2(etf); + printf(" in f %f enc %i/0x%x dec %f / %f\n", + tf,etf,etf,detf,detf2); + printf(" [%i] OPENBRANCH cnt %i bb %i %i %i: %i %i %i -> %f %f %f : %f %f %f\n", + i, + n.openBranch.count, + (bb.lower[0]), + (bb.lower[1]), + (bb.lower[2]), + (bb.upper[0]), + (bb.upper[1]), + (bb.upper[2]), + _decode(bb.lower[0]), + _decode(bb.lower[1]), + _decode(bb.lower[2]), + _decode(bb.upper[0]), + _decode(bb.upper[1]), + _decode(bb.upper[2])); + } break; + case DONE_NODE: + break; + } + } + } +#endif + + + CUBQL_CUDA_SYNC_CHECK_STREAM(s); + CUBQL_CUDA_SYNC_CHECK(); + initPrims<<>> (tempNodes, primStates,boxes,numPrims); + CUBQL_CUDA_SYNC_CHECK_STREAM(s); + CUBQL_CUDA_SYNC_CHECK(); + +#if 0 + CUBQL_CUDA_SYNC_CHECK_STREAM(s); + CUBQL_CUDA_SYNC_CHECK(); + { + int numNodes = 1; + std::cout << "*********** INITIAL BOUNDS ***********" << std::endl; + std::vector h_nodeStates(numNodes); + std::vector> h_tempNodes(numNodes); + CUBQL_CUDA_CALL(Memcpy(h_nodeStates.data(),nodeStates, + numNodes*sizeof(*nodeStates),cudaMemcpyDefault)); +CUBQL_CUDA_CALL(Memcpy(h_tempNodes.data(),tempNodes, + numNodes*sizeof(*tempNodes),cudaMemcpyDefault)); + for (int i=0;i %f %f %f : %f %f %f\n", + i, + n.openBranch.count, + (bb.lower[0]), + (bb.lower[1]), + (bb.lower[2]), + (bb.upper[0]), + (bb.upper[1]), + (bb.upper[2]), + decode(bb.lower[0]), + decode(bb.lower[1]), + decode(bb.lower[2]), + decode(bb.upper[0]), + decode(bb.upper[1]), + decode(bb.upper[2])); + } break; + case DONE_NODE: + break; + } + } + } +#endif + + int numDone = 0; int numNodes; @@ -447,60 +615,168 @@ namespace cuBQL { CUBQL_CUDA_CALL(EventCreate(&stateDownloadedEvent)); -#if CUBQL_PROFILE - int pass = 0; - static Profile t_writeNodes; - static Profile t_writePrims; - static Profile t_sortPrims; - static Profile t_nodePass[100]; - static Profile t_primPass[100]; - if (t_writeNodes.name == "") { - t_writeNodes.setName("writeNodes"); - t_writePrims.setName("writePrims"); - t_sortPrims.setName("sortPrims"); - for (int i=0;i<100;i++) { - t_nodePass[i].setName("nodePass",i); - t_primPass[i].setName("primPass",i); - } - } -#endif +// #if CUBQL_PROFILE +// int pass = 0; +// static Profile t_writeNodes; +// static Profile t_writePrims; +// static Profile t_sortPrims; +// static Profile t_nodePass[100]; +// static Profile t_primPass[100]; +// if (t_writeNodes.name == "") { +// t_writeNodes.setName("writeNodes"); +// t_writePrims.setName("writePrims"); +// t_sortPrims.setName("sortPrims"); +// for (int i=0;i<100;i++) { +// t_nodePass[i].setName("nodePass",i); +// t_primPass[i].setName("primPass",i); +// } +// } +// #endif + PING; while (true) { + PING; + PRINT(numDone); + int pre_numNodes = numNodes; CUBQL_CUDA_CALL(MemcpyAsync(&numNodes,&buildState->numNodes, sizeof(numNodes),cudaMemcpyDeviceToHost,s)); - if (numNodes == numDone) - break; CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s)); CUBQL_CUDA_CALL(EventSynchronize(stateDownloadedEvent)); -#if CUBQL_PROFILE - t_nodePass[pass].sync_start(); + PRINT(numNodes); + CUBQL_CUDA_SYNC_CHECK(); + CUBQL_CUDA_SYNC_CHECK_STREAM(s); + if (numNodes == numDone) + break; + PRINT(numNodes-numDone); + bool dbg = (numNodes-numDone) == 8098; +// #if CUBQL_PROFILE +// t_nodePass[pass].sync_start(); +// #endif + CUBQL_CUDA_SYNC_CHECK(); + CUBQL_CUDA_SYNC_CHECK_STREAM(s); + +#if 0 + { + std::cout << "*********** PRE SELECT ***********" << std::endl; + std::vector h_nodeStates(numNodes); + std::vector> h_tempNodes(numNodes); + CUBQL_CUDA_CALL(Memcpy(h_nodeStates.data(),nodeStates, + numNodes*sizeof(*nodeStates),cudaMemcpyDefault)); + CUBQL_CUDA_CALL(Memcpy(h_tempNodes.data(),tempNodes, + numNodes*sizeof(*tempNodes),cudaMemcpyDefault)); + for (int i=0;i>> (buildState, nodeStates,tempNodes,numNodes, buildConfig); -#if CUBQL_PROFILE - t_nodePass[pass].sync_stop(); - t_primPass[pass].sync_start(); + CUBQL_CUDA_SYNC_CHECK(); + CUBQL_CUDA_SYNC_CHECK_STREAM(s); + +#if 0 + { + std::vector h_nodeStates(numNodes); + std::vector> h_tempNodes(numNodes); +CUBQL_CUDA_CALL(Memcpy(h_nodeStates.data(),nodeStates, + numNodes*sizeof(*nodeStates),cudaMemcpyDefault)); +CUBQL_CUDA_CALL(Memcpy(h_tempNodes.data(),tempNodes, + numNodes*sizeof(*tempNodes),cudaMemcpyDefault)); + for (int i=0;i centBounds; + // uint32_t count; + // uint32_t unused; + // } openBranch; + // struct { + // uint32_t offset; + // int dim; + // uint32_t tieBreaker; + // float pos; + // } openNode; + // struct { + // uint32_t offset; + // uint32_t count; + // uint32_t unused[2]; + // } doneNode; + } + } #endif + +// #if CUBQL_PROFILE +// t_nodePass[pass].sync_stop(); +// t_primPass[pass].sync_start(); +// #endif + numDone = numNodes; // #if 1 - if (sizeof(T)*D <= sizeof(float3)) { + if (0 && sizeof(T)*D <= sizeof(float3)) { updatePrims_shm<<>> (nodeStates,tempNodes, primStates,boxes,numPrims,numDone); } else // #else - updatePrims<<>> + updatePrims<<>> (nodeStates,tempNodes, primStates,boxes,numPrims); // #endif + + CUBQL_CUDA_SYNC_CHECK(); + CUBQL_CUDA_SYNC_CHECK_STREAM(s); -#if CUBQL_PROFILE - t_primPass[pass].sync_stop(); - ++ pass; -#endif +// #if CUBQL_PROFILE +// t_primPass[pass].sync_stop(); +// ++ pass; +// #endif } + PING; CUBQL_CUDA_CALL(EventDestroy(stateDownloadedEvent)); // ================================================================== // sort {item,nodeID} list @@ -510,9 +786,10 @@ namespace cuBQL { uint8_t *d_temp_storage = NULL; size_t temp_storage_bytes = 0; PrimState *sortedPrimStates = 0; -#if CUBQL_PROFILE - t_sortPrims.sync_start(); -#endif +// #if CUBQL_PROFILE +// t_sortPrims.sync_start(); +// #endif + PING; _ALLOC(sortedPrimStates,numPrims,s,memResource); auto rc = cub::DeviceRadixSort::SortKeys((void*&)d_temp_storage, temp_storage_bytes, @@ -527,23 +804,25 @@ namespace cuBQL { numPrims,32,64,s); rc = rc; _FREE(d_temp_storage,s,memResource); -#if CUBQL_PROFILE - t_sortPrims.sync_stop(); - t_writePrims.sync_start(); -#endif +// #if CUBQL_PROFILE +// t_sortPrims.sync_stop(); +// t_writePrims.sync_start(); +// #endif // ================================================================== // allocate and write BVH item list, and write offsets of leaf nodes // ================================================================== + PING; bvh.numPrims = numPrims; _ALLOC(bvh.primIDs,numPrims,s,memResource); writePrimsAndLeafOffsets<<>> (tempNodes,bvh.primIDs,sortedPrimStates,numPrims); -#if CUBQL_PROFILE - t_writePrims.sync_stop(); - t_writeNodes.sync_start(); -#endif +// #if CUBQL_PROFILE +// t_writePrims.sync_stop(); +// t_writeNodes.sync_start(); +// #endif + PING; // ================================================================== // allocate and write final nodes // ================================================================== @@ -551,14 +830,15 @@ namespace cuBQL { _ALLOC(bvh.nodes,numNodes,s,memResource); writeNodes<<>> (bvh.nodes,tempNodes,numNodes); -#if CUBQL_PROFILE - t_writeNodes.sync_stop(); -#endif +// #if CUBQL_PROFILE +// t_writeNodes.sync_stop(); +// #endif _FREE(sortedPrimStates,s,memResource); _FREE(tempNodes,s,memResource); _FREE(nodeStates,s,memResource); _FREE(primStates,s,memResource); _FREE(buildState,s,memResource); + PING; } } // ::cuBQL::gpuBuilder_impl diff --git a/cuBQL/math/box.h b/cuBQL/math/box.h index 07736c4..a26f49f 100644 --- a/cuBQL/math/box.h +++ b/cuBQL/math/box.h @@ -79,7 +79,8 @@ namespace cuBQL { /*! create a box from two points. note this will NOT make sure that a &a, const vec_t_data &b) + inline __cubql_both box_t(const vec_t_data &a, + const vec_t_data &b) { lower = vec_t(a); upper = vec_t(b); @@ -88,11 +89,23 @@ namespace cuBQL { /*! returns a box that bounds both 'this' and another point 'v'; this does not get modified */ inline __cubql_both box_t including(const vec_t &v) const - { return box_t{min(lower,v),max(upper,v)}; } + { + // return box_t(min(lower,v),max(upper,v)); + box_t b; + b.lower = min(this->lower,v); + b.upper = max(this->upper,v); + return b; + } /*! returns a box that bounds both 'this' and another box 'b'; this does not get modified */ inline __cubql_both box_t including(const box_t &b) const - { return box_t{min(lower,b.lower),max(upper,b.upper)}; } + { + box_t bb; + bb.lower = min(this->lower,b.lower); + bb.upper = max(this->upper,b.upper); + return bb; + // return box_t(min(lower,b.lower),max(upper,b.upper)); + } inline __cubql_both box_t &grow(const vec_t &v) { lower = min(lower,v); upper = max(upper,v); return *this; } diff --git a/cuBQL/math/common.h b/cuBQL/math/common.h index a3063dd..00a1b68 100644 --- a/cuBQL/math/common.h +++ b/cuBQL/math/common.h @@ -336,8 +336,8 @@ namespace cuBQL { #define CUBQL_CUDA_SYNC_CHECK() \ { \ - cudaDeviceSynchronize(); \ - cudaError_t rc = cudaGetLastError(); \ + cudaError_t rc = cudaDeviceSynchronize(); \ + rc = cudaGetLastError(); \ if (rc != cudaSuccess) { \ fprintf(stderr, "error (%s: line %d): %s\n", \ __FILE__, __LINE__, cudaGetErrorString(rc)); \ diff --git a/cuBQL/math/constants.h b/cuBQL/math/constants.h index 4b5c3c4..488ad7f 100644 --- a/cuBQL/math/constants.h +++ b/cuBQL/math/constants.h @@ -6,7 +6,7 @@ #include #include #if defined(__CUDACC__) && !defined(CUDART_INF_F) -#include +// #include #endif #ifndef M_PI diff --git a/cuBQL/math/math.h b/cuBQL/math/math.h index 6943b75..86855fb 100644 --- a/cuBQL/math/math.h +++ b/cuBQL/math/math.h @@ -4,14 +4,15 @@ #pragma once #include "cuBQL/math/common.h" -#ifdef __CUDACC__ -#include -#endif +// #ifdef __CUDACC__ +// #include +// #endif #include namespace cuBQL { -#ifdef __CUDACC__ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +// #if defined(__CUDACC__) || defined(__HIPCC__) // make sure we use the built-in cuda functoins that use floats, not // the c-stdlib ones that use doubles. using ::min; @@ -21,14 +22,15 @@ namespace cuBQL { using std::max; #endif -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +// #if defined(__CUDA_ARCH__) || defined(__HIPCC__) # define CUBQL_INF CUDART_INF_F // # define CUBQL_INF ::cuda::std::numeric_limits::infinity() #else # define CUBQL_INF std::numeric_limits::infinity() #endif -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) || defined(__HIPCC__) #else inline __cubql_both float __int_as_float(int i) { return (const float &)i; } inline __cubql_both int __float_as_int(float f) { return (const int &)f; } diff --git a/cuBQL/math/vec.h b/cuBQL/math/vec.h index 9f7a9b4..3048657 100644 --- a/cuBQL/math/vec.h +++ b/cuBQL/math/vec.h @@ -7,7 +7,7 @@ #include #include #include "constants.h" -#ifdef __CUDACC__ +#if defined(__CUDACC__) && !defined(CUDART_INF) # include #endif @@ -19,10 +19,10 @@ namespace cuBQL { -#ifndef __CUDACC__ - using std::min; - using std::max; -#endif +// #ifndef __CUDA_ARCH__ +// using std::min; +// using std::max; +// #endif #ifndef CUBQL_SUPPORT_CUDA_VECTOR_TYPES #define CUBQL_SUPPORT_CUDA_VECTOR_TYPES 0 @@ -46,7 +46,8 @@ namespace cuBQL { equivalent, let's also create a 'invalid_t' to be used by default */ -#ifndef __CUDACC__ +#if defined(__CUDACC__) || defined(__HIPCC__) +#else struct float2 { float x, y; }; struct float3 { float x, y, z; }; struct CUBQL_ALIGN(16) float4 { float x, y, z, w; }; @@ -492,18 +493,21 @@ namespace cuBQL { { return vec_t( v.x >> b, v.y >> b, v.z >> b, v.w >> b ); } inline __cubql_both double abs(double d) { -#ifdef __CUDA_ARCH__ +// #ifdef __CUDA_ARCH__ return ::abs(d); -#else - return std::abs(d); -#endif +// #else +// return std::abs(d); +// #endif } inline __cubql_both float abs(float d) { -#ifdef __CUDA_ARCH__ - return ::abs(d); +#if defined(__CUDA_ARCH__) || defined(__CUDACC__) + return abs(d); #else return std::abs(d); #endif +// #else +// return std::abs(d); +// #endif } // inline __cubql_both double abs(double d) { return absf(d); } @@ -711,7 +715,8 @@ namespace cuBQL { template inline __cubql_both - vec_t normalize(vec_t v) { return v * (T(1)/sqrt(dot(v,v))); } + vec_t normalize(vec_t v) { return v * (T(1)/::sqrt(dot(v,v))); } + // vec_t normalize(vec_t v) { return v * (T(1)/sqrt(dot(v,v))); } // ------------------------------------------------------------------ diff --git a/cuBQL/queries/triangleData/math/rayTriangleIntersections.h b/cuBQL/queries/triangleData/math/rayTriangleIntersections.h index bc20150..44a7683 100644 --- a/cuBQL/queries/triangleData/math/rayTriangleIntersections.h +++ b/cuBQL/queries/triangleData/math/rayTriangleIntersections.h @@ -40,7 +40,7 @@ namespace cuBQL { #else auto dbg = _dbg; #endif - if (dbg) /* nothing, just for 'maybe_unused' */; + if (dbg) {/* nothing, just for 'maybe_unused' */}; using vec3 = vec_t; const vec3 v0(tri.a); diff --git a/cuBQL/traversal/rayQueries.h b/cuBQL/traversal/rayQueries.h index 838c6aa..78a8011 100644 --- a/cuBQL/traversal/rayQueries.h +++ b/cuBQL/traversal/rayQueries.h @@ -716,11 +716,11 @@ namespace cuBQL { ray_t &ray, bool _dbg) { -#ifdef NDEBUG +// #ifdef NDEBUG const bool dbg = false; -#else - bool dbg = _dbg; -#endif +// #else +// bool dbg = _dbg; +// #endif using bvh_t = BinaryBVH; using node_t = typename bvh_t::node_t; using scalar_t = typename bvh_t::scalar_t; @@ -746,7 +746,7 @@ namespace cuBQL { if (ray.direction.y == (scalar_t)0) ray.direction.y = scalar_t(1e-20); if (ray.direction.z == (scalar_t)0) ray.direction.z = scalar_t(1e-20); vec3_t rcp_dir = rcp(ray.direction); - + // ------------------------------------------------------------------ // traverse until there's nothing left to traverse: // ------------------------------------------------------------------ @@ -761,6 +761,7 @@ namespace cuBQL { while (true) { // if (dbg) printf("node %i.%i\n",(int)node.offset,(int)node.count); if (node.count != 0) { + // return;//xx // it's a boy! - seriously: this is not a inner node; so // we're either at a final leaf, or at an instance node if (blasStackBase != nullptr) @@ -771,8 +772,8 @@ namespace cuBQL { tlasSavedNodePtr = bvh.nodes; tlasSavedPrimIDs = bvh.primIDs; #ifndef NDEBUG - if (node.count != 1) - printf("TWO-LEVEL BVH MUST BE BUILT WITH 1 PRIM PER LEAF!\n"); + // if (node.count != 1) + // printf("TWO-LEVEL BVH MUST BE BUILT WITH 1 PRIM PER LEAF!\n"); #endif // if (dbg) // printf("inner-leaf primIDs %p ofs %i count %i\n", @@ -809,6 +810,7 @@ namespace cuBQL { // if (dbg) printf("new node %i.%i\n",(int)node.offset,(int)node.count); } + uint32_t n0Idx = (uint32_t)node.offset+0; uint32_t n1Idx = (uint32_t)node.offset+1; node_t n0 = bvh.nodes[n0Idx]; @@ -817,12 +819,14 @@ namespace cuBQL { bool o0 = rayIntersectsBox(node_t0,ray,rcp_dir,n0.bounds,dbg); bool o1 = rayIntersectsBox(node_t1,ray,rcp_dir,n1.bounds,dbg); - if (dbg) { - dout << " node L " << n0.bounds << "\n"; - dout << " node R " << n1.bounds << "\n"; - dout << "children L hit " << int(o0) << " dist " << node_t0 - << " R hit " << int(o1) << " dist " << node_t1 << "\n"; - } + // return; //xx + + // if (dbg) { + // dout << " node L " << n0.bounds << "\n"; + // dout << " node R " << n1.bounds << "\n"; + // dout << "children L hit " << int(o0) << " dist " << node_t0 + // << " R hit " << int(o1) << " dist " << node_t1 << "\n"; + // } if (o0) { if (o1) { if ((stackPtr-traversalStack) >= STACK_DEPTH) { @@ -849,15 +853,16 @@ namespace cuBQL { } } } - + + // return;//xx + if (node.count != 0 && blasStackBase != nullptr) { // we're at a valid leaf: call the lambda and see if that gave // us a new, closer cull radius // if (dbg) // printf("trav leaf-leaf primIDs %p offset %i count %i\n", // bvh.primIDs,(int)node.offset,(int)node.count); - ray.tMax - = processLeaf(bvh.primIDs,(int)node.offset,(int)node.count); + ray.tMax = processLeaf(bvh.primIDs,(int)node.offset,(int)node.count); } // ------------------------------------------------------------------ // pop next un-traversed node from stack, discarding any nodes From eceb7326384ef7f049455d314480dbd8b5a5622c Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Sun, 14 Jun 2026 13:29:55 -0600 Subject: [PATCH 4/9] cleanups --- cuBQL/builder/cpu.h | 1 - cuBQL/builder/cpu/spatialMedian.h | 28 +-- cuBQL/builder/cuda.h | 6 +- cuBQL/builder/cuda/builder_common.h | 122 +++++----- cuBQL/builder/cuda/gpu_builder.h | 2 - cuBQL/builder/cuda/sm_builder.h | 335 +--------------------------- 6 files changed, 66 insertions(+), 428 deletions(-) diff --git a/cuBQL/builder/cpu.h b/cuBQL/builder/cpu.h index 9c78acd..bdf447d 100644 --- a/cuBQL/builder/cpu.h +++ b/cuBQL/builder/cpu.h @@ -66,7 +66,6 @@ namespace cuBQL { uint32_t numPrims, BuildConfig buildConfig) { - PING; /*! right now, only have a slow spatial median builder */ cpu::spatialMedian(bvh,boxes,numPrims,buildConfig); } diff --git a/cuBQL/builder/cpu/spatialMedian.h b/cuBQL/builder/cpu/spatialMedian.h index 6f128b8..ace5e24 100644 --- a/cuBQL/builder/cpu/spatialMedian.h +++ b/cuBQL/builder/cpu/spatialMedian.h @@ -112,31 +112,11 @@ namespace cuBQL { node.bounds = box_t() .including(bvh.nodes[node.admin.offset+0].bounds) .including(bvh.nodes[node.admin.offset+1].bounds); - if (node.bounds.empty()) { - PING; - PRINT(bvh.numNodes); - PRINT(node.admin.offset); - PRINT(node.admin.count); - PRINT(bvh.nodes[node.admin.offset+0].bounds); - PRINT(bvh.nodes[node.admin.offset+1].bounds); - - box3f a = box_t() - .including(bvh.nodes[node.admin.offset+0].bounds); - box3f b = box_t() - .including(bvh.nodes[node.admin.offset+0].bounds) - .including(bvh.nodes[node.admin.offset+1].bounds); - box3f c = box_t(); - PRINT(a); - PRINT(b); - PRINT(c); - } } else { node.bounds.clear(); for (int i=0;i @@ -147,16 +127,14 @@ namespace cuBQL { { using box_t = ::cuBQL::box_t; std::vector primIDs; - PING; PRINT(numPrims); for (int i=0;i::Node[1]; bvh.nodes[0].bounds = box3f(); bvh.nodes[0].admin.offset = 0; @@ -166,11 +144,9 @@ namespace cuBQL { bvh.primIDs[i] = i; bvh.numPrims = numPrims; bvh.numNodes = 1; - PING; return; } - PRINT(primIDs.size()); std::vector altPrimIDs(primIDs.size()); std::vector topo(1); @@ -190,8 +166,6 @@ namespace cuBQL { } topo.clear(); refit(0,bvh,boxes); - PING; - PRINT(bvh.nodes[0].bounds); } } // spatialMedian_impl diff --git a/cuBQL/builder/cuda.h b/cuBQL/builder/cuda.h index eca25ef..061091b 100644 --- a/cuBQL/builder/cuda.h +++ b/cuBQL/builder/cuda.h @@ -77,12 +77,14 @@ namespace cuBQL { cudaMemPool_t mempool; CUBQL_CUDA_CALL(DeviceGetDefaultMemPool(&mempool, iDevice)); uint64_t threshold = UINT64_MAX; - CUBQL_CUDA_CALL(MemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold)); + CUBQL_CUDA_CALL + (MemPoolSetAttribute(mempool, + cudaMemPoolAttrReleaseThreshold, + &threshold)); } } ); } void malloc(void** ptr, size_t size, cudaStream_t s) override { - PING; #ifndef NDEBUG if (s_numDevices > 1 && s == 0) std::cerr << "@cuBQL: warning; async memory allocator used with default stream." diff --git a/cuBQL/builder/cuda/builder_common.h b/cuBQL/builder/cuda/builder_common.h index 49eac52..3e080c7 100644 --- a/cuBQL/builder/cuda/builder_common.h +++ b/cuBQL/builder/cuda/builder_common.h @@ -81,86 +81,75 @@ namespace cuBQL { template struct int_type_of; template<> struct int_type_of { typedef int32_t type; }; - // template<> struct int_type_of { typedef int64_t type; }; - // template<> struct int_type_of { typedef int32_t type; }; - // template<> struct int_type_of { typedef int64_t type; }; - // template<> struct int_type_of { typedef uint32_t type; }; - // template<> struct int_type_of { typedef uint64_t type; }; + template<> struct int_type_of { typedef int64_t type; }; + template<> struct int_type_of { typedef int32_t type; }; + template<> struct int_type_of { typedef int64_t type; }; + template<> struct int_type_of { typedef uint32_t type; }; + template<> struct int_type_of { typedef uint64_t type; }; - // template inline __device__ - // typename int_type_of::type encode(T v); + template inline __device__ + typename int_type_of::type encode(T v); - // template<> - inline __device__ - int32_t encode(const float &f) + template<> inline __device__ + int32_t encode(float f) { - // int *ptr = (int *)&f; - const int32_t sign = 0x80000000U; - // int32_t bits = __float_as_int(f); - int32_t bits;// = *ptr;//(const int &)f; - memcpy(&bits,&f,4); + int32_t bits = __float_as_int(f); if (bits & sign) bits ^= 0x7fffffff; return bits; } - // template<> inline __device__ - // int64_t encode(double f) - // { - // const int64_t sign = 0x8000000000000000LL; - // int64_t bits = __double_as_longlong(f); - // if (bits & sign) bits ^= 0x7fffffffffffffffLL; - // return bits; - // } - // template<> inline __device__ - // int32_t encode(int32_t bits) - // { - // return bits; - // } - // template<> inline __device__ - // int64_t encode(int64_t bits) - // { - // return bits; - // } + + template<> inline __device__ + int64_t encode(double f) + { + const int64_t sign = 0x8000000000000000LL; + int64_t bits = __double_as_longlong(f); + if (bits & sign) bits ^= 0x7fffffffffffffffLL; + return bits; + } + template<> inline __device__ + int32_t encode(int32_t bits) + { + return bits; + } + template<> inline __device__ + int64_t encode(int64_t bits) + { + return bits; + } template inline __device__ T decode(int32_t v); - // template inline __device__ - // T decode(uint32_t v); - // template inline __device__ - // T decode(int64_t v); - // template inline __device__ - // T decode(uint64_t v); + template inline __device__ + T decode(uint32_t v); + template inline __device__ + T decode(int64_t v); + template inline __device__ + T decode(uint64_t v); template<> inline __device__ float decode(int32_t bits) { const int32_t sign = 0x80000000; if (bits & sign) bits ^= 0x7fffffff; -// #ifdef __HIPCC__ -// float ff = ((const float &)bits); -// return ff; -// #endif - float f; - memcpy(&f,&bits,sizeof(4)); - return f; - // float *f = (float *)&bits; - // return *f; - // return __int_as_float(bits); + return __int_as_float(bits); } - // template<> inline __device__ - // int32_t decode(int32_t bits) - // { return bits; } - // template<> inline __device__ - // int64_t decode(int64_t bits) - // { return bits; } + + template<> inline __device__ + int32_t decode(int32_t bits) + { return bits; } + + template<> inline __device__ + int64_t decode(int64_t bits) + { return bits; } - // template<> inline __device__ - // double decode(int64_t bits) - // { - // const int64_t sign = 0x8000000000000000LL; - // if (bits & sign) bits ^= 0x7fffffffffffffffLL; - // return __longlong_as_double(bits); - // } + template<> inline __device__ + double decode(int64_t bits) + { + const int64_t sign = 0x8000000000000000LL; + if (bits & sign) bits ^= 0x7fffffffffffffffLL; + return __longlong_as_double(bits); + } template struct CUBQL_ALIGN(8) AtomicBox { @@ -233,13 +222,8 @@ namespace cuBQL { using scalar_t = typename box_t::scalar_t; #pragma unroll for (int d=0;d());//encode(+FLT_MAX); upper[d] = encode(empty_box_upper());//encode(-FLT_MAX); -#endif } } @@ -248,11 +232,9 @@ namespace cuBQL { typename box_t::vec_t other) { using scalar_t = typename AtomicBox::scalar_t; - auto in_y = abox.upper[1]; #pragma unroll for (int d=0;d::type enc - const int32_t enc + const typename int_type_of::type enc = encode(other[d]); if (enc < abox.lower[d]) atomic_min((volatile int32_t*)abox.lower+d,enc); diff --git a/cuBQL/builder/cuda/gpu_builder.h b/cuBQL/builder/cuda/gpu_builder.h index 9b396dc..18365f7 100644 --- a/cuBQL/builder/cuda/gpu_builder.h +++ b/cuBQL/builder/cuda/gpu_builder.h @@ -52,9 +52,7 @@ namespace cuBQL { buildConfig.makeLeafThreshold = 1; gpuBuilder_impl::build(bvh,boxes,numBoxes,buildConfig,s,memResource); } - PING; cuBQL::cuda::refit(bvh,boxes,s,memResource); - PING; } namespace cuda { diff --git a/cuBQL/builder/cuda/sm_builder.h b/cuBQL/builder/cuda/sm_builder.h index c3cc42c..30ef82c 100644 --- a/cuBQL/builder/cuda/sm_builder.h +++ b/cuBQL/builder/cuda/sm_builder.h @@ -25,10 +25,10 @@ namespace cuBQL { }; template - struct// CUBQL_ALIGN(16) + struct CUBQL_ALIGN(16) TempNode { using box_t = cuBQL::box_t; - // union { + union { struct { uint32_t count; uint32_t unused; @@ -44,7 +44,7 @@ namespace cuBQL { uint32_t offset; uint32_t count; } doneNode; - // }; + }; }; template @@ -53,15 +53,11 @@ namespace cuBQL { NodeState *nodeStates, TempNode *nodes) { - if (threadIdx.x > 0) return; - buildState->numNodes = 2; nodeStates[0] = OPEN_BRANCH; nodes[0].openBranch.count = 0; nodes[0].openBranch.centBounds.set_empty(); - if (nodes[0].openBranch.centBounds.upper[0] > 0) - nodes[0].openBranch.centBounds.upper[0] = 8888; nodeStates[1] = DONE_NODE; nodes[1].doneNode.offset = 0; @@ -87,8 +83,7 @@ namespace cuBQL { // this could be made faster by block-reducing ... atomicAdd(&nodes[0].openBranch.count,1); auto ctr = box.center(); - atomic_grow(nodes[0].openBranch.centBounds,ctr);//centerOf(box)); - // atomic_grow(nodes[0].openBranch.centBounds,centerOf(box)); + atomic_grow(nodes[0].openBranch.centBounds,ctr); } else { me.nodeID = (uint32_t)-1; me.done = true; @@ -103,7 +98,7 @@ namespace cuBQL { uint32_t numNodes, BuildConfig buildConfig) { -#if 0 +#if 1 __shared__ int l_newNodeOfs; if (threadIdx.x == 0) l_newNodeOfs = 0; @@ -431,9 +426,6 @@ namespace cuBQL { GpuMemoryResource &memResource) { assert(sizeof(PrimState) == sizeof(uint64_t)); - CUBQL_CUDA_SYNC_CHECK_STREAM(s); - CUBQL_CUDA_SYNC_CHECK(); - // ================================================================== // do build on temp nodes // ================================================================== @@ -446,337 +438,46 @@ namespace cuBQL { _ALLOC(primStates,numPrims,s,memResource); _ALLOC(buildState,1,s,memResource); -#if 1 - CUBQL_CUDA_SYNC_CHECK_STREAM(s); - CUBQL_CUDA_SYNC_CHECK(); - { - int numNodes = 1; - std::cout << "*********** INPUT BOUNDS ***********" << std::endl; - std::vector h_bbs(numPrims); - CUBQL_CUDA_CALL(Memcpy(h_bbs.data(),boxes, - numPrims*sizeof(*boxes),cudaMemcpyDefault)); - for (int i=0;i 1000000) - PRINT(bb); - } - } -#endif + initState<<<1,128,0,s>>>(buildState, + nodeStates, + tempNodes); - CUBQL_CUDA_SYNC_CHECK_STREAM(s); - CUBQL_CUDA_SYNC_CHECK(); - initState<<<1,128,0,s>>>(buildState, - nodeStates, - tempNodes); - - - CUBQL_CUDA_SYNC_CHECK_STREAM(s); - CUBQL_CUDA_SYNC_CHECK(); -#if 0 - CUBQL_CUDA_SYNC_CHECK_STREAM(s); - CUBQL_CUDA_SYNC_CHECK(); - { - int numNodes = 1; - std::cout << "*********** AFTER INITSTATE ***********" << std::endl; - std::vector h_nodeStates(numNodes); - std::vector> h_tempNodes(numNodes); - CUBQL_CUDA_CALL(Memcpy(h_nodeStates.data(),nodeStates, - numNodes*sizeof(*nodeStates),cudaMemcpyDefault)); -CUBQL_CUDA_CALL(Memcpy(h_tempNodes.data(),tempNodes, - numNodes*sizeof(*tempNodes),cudaMemcpyDefault)); - for (int i=0;i float { - const int32_t sign = 0x80000000; - if (bits & sign) bits ^= 0x7fffffff; - float f; - memcpy(&f,&bits,4); - return f; - }; - auto _decode2 = [](int32_t bits) -> float { - const int32_t sign = 0x80000000; - if (bits & sign) bits ^= 0x7fffffff; - return (const float &)bits; - }; - auto _encode = [](float f) -> int32_t { - int *ptr = (int *)&f; - const int32_t sign = 0x80000000U; - // int32_t bits = __float_as_int(f); - int32_t bits = *ptr;//(const int &)f; - if (bits & sign) bits ^= 0x7fffffff; - return bits; - }; - auto &bb = n.openBranch.centBounds; - float tf = +1e8f; - auto etf = _encode(tf); - auto detf = _decode(etf); - auto detf2 = _decode2(etf); - printf(" in f %f enc %i/0x%x dec %f / %f\n", - tf,etf,etf,detf,detf2); - printf(" [%i] OPENBRANCH cnt %i bb %i %i %i: %i %i %i -> %f %f %f : %f %f %f\n", - i, - n.openBranch.count, - (bb.lower[0]), - (bb.lower[1]), - (bb.lower[2]), - (bb.upper[0]), - (bb.upper[1]), - (bb.upper[2]), - _decode(bb.lower[0]), - _decode(bb.lower[1]), - _decode(bb.lower[2]), - _decode(bb.upper[0]), - _decode(bb.upper[1]), - _decode(bb.upper[2])); - } break; - case DONE_NODE: - break; - } - } - } -#endif - - - CUBQL_CUDA_SYNC_CHECK_STREAM(s); - CUBQL_CUDA_SYNC_CHECK(); initPrims<<>> (tempNodes, primStates,boxes,numPrims); - CUBQL_CUDA_SYNC_CHECK_STREAM(s); - CUBQL_CUDA_SYNC_CHECK(); - - -#if 0 - CUBQL_CUDA_SYNC_CHECK_STREAM(s); - CUBQL_CUDA_SYNC_CHECK(); - { - int numNodes = 1; - std::cout << "*********** INITIAL BOUNDS ***********" << std::endl; - std::vector h_nodeStates(numNodes); - std::vector> h_tempNodes(numNodes); - CUBQL_CUDA_CALL(Memcpy(h_nodeStates.data(),nodeStates, - numNodes*sizeof(*nodeStates),cudaMemcpyDefault)); -CUBQL_CUDA_CALL(Memcpy(h_tempNodes.data(),tempNodes, - numNodes*sizeof(*tempNodes),cudaMemcpyDefault)); - for (int i=0;i %f %f %f : %f %f %f\n", - i, - n.openBranch.count, - (bb.lower[0]), - (bb.lower[1]), - (bb.lower[2]), - (bb.upper[0]), - (bb.upper[1]), - (bb.upper[2]), - decode(bb.lower[0]), - decode(bb.lower[1]), - decode(bb.lower[2]), - decode(bb.upper[0]), - decode(bb.upper[1]), - decode(bb.upper[2])); - } break; - case DONE_NODE: - break; - } - } - } -#endif - int numDone = 0; int numNodes; // ------------------------------------------------------------------ cudaEvent_t stateDownloadedEvent; CUBQL_CUDA_CALL(EventCreate(&stateDownloadedEvent)); - - -// #if CUBQL_PROFILE -// int pass = 0; -// static Profile t_writeNodes; -// static Profile t_writePrims; -// static Profile t_sortPrims; -// static Profile t_nodePass[100]; -// static Profile t_primPass[100]; -// if (t_writeNodes.name == "") { -// t_writeNodes.setName("writeNodes"); -// t_writePrims.setName("writePrims"); -// t_sortPrims.setName("sortPrims"); -// for (int i=0;i<100;i++) { -// t_nodePass[i].setName("nodePass",i); -// t_primPass[i].setName("primPass",i); -// } -// } -// #endif - PING; while (true) { - PING; - PRINT(numDone); int pre_numNodes = numNodes; CUBQL_CUDA_CALL(MemcpyAsync(&numNodes,&buildState->numNodes, sizeof(numNodes),cudaMemcpyDeviceToHost,s)); CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s)); CUBQL_CUDA_CALL(EventSynchronize(stateDownloadedEvent)); - PRINT(numNodes); - CUBQL_CUDA_SYNC_CHECK(); CUBQL_CUDA_SYNC_CHECK_STREAM(s); if (numNodes == numDone) break; - PRINT(numNodes-numDone); - bool dbg = (numNodes-numDone) == 8098; -// #if CUBQL_PROFILE -// t_nodePass[pass].sync_start(); -// #endif - CUBQL_CUDA_SYNC_CHECK(); - CUBQL_CUDA_SYNC_CHECK_STREAM(s); -#if 0 - { - std::cout << "*********** PRE SELECT ***********" << std::endl; - std::vector h_nodeStates(numNodes); - std::vector> h_tempNodes(numNodes); - CUBQL_CUDA_CALL(Memcpy(h_nodeStates.data(),nodeStates, - numNodes*sizeof(*nodeStates),cudaMemcpyDefault)); - CUBQL_CUDA_CALL(Memcpy(h_tempNodes.data(),tempNodes, - numNodes*sizeof(*tempNodes),cudaMemcpyDefault)); - for (int i=0;i>> (buildState, nodeStates,tempNodes,numNodes, buildConfig); - CUBQL_CUDA_SYNC_CHECK(); - CUBQL_CUDA_SYNC_CHECK_STREAM(s); -#if 0 - { - std::vector h_nodeStates(numNodes); - std::vector> h_tempNodes(numNodes); -CUBQL_CUDA_CALL(Memcpy(h_nodeStates.data(),nodeStates, - numNodes*sizeof(*nodeStates),cudaMemcpyDefault)); -CUBQL_CUDA_CALL(Memcpy(h_tempNodes.data(),tempNodes, - numNodes*sizeof(*tempNodes),cudaMemcpyDefault)); - for (int i=0;i centBounds; - // uint32_t count; - // uint32_t unused; - // } openBranch; - // struct { - // uint32_t offset; - // int dim; - // uint32_t tieBreaker; - // float pos; - // } openNode; - // struct { - // uint32_t offset; - // uint32_t count; - // uint32_t unused[2]; - // } doneNode; - } - } -#endif - -// #if CUBQL_PROFILE -// t_nodePass[pass].sync_stop(); -// t_primPass[pass].sync_start(); -// #endif - numDone = numNodes; -// #if 1 - if (0 && sizeof(T)*D <= sizeof(float3)) { + if (1 && sizeof(T)*D <= sizeof(float3)) { updatePrims_shm<<>> (nodeStates,tempNodes, primStates,boxes,numPrims,numDone); } else -// #else updatePrims<<>> (nodeStates,tempNodes, primStates,boxes,numPrims); -// #endif - - CUBQL_CUDA_SYNC_CHECK(); - CUBQL_CUDA_SYNC_CHECK_STREAM(s); - -// #if CUBQL_PROFILE -// t_primPass[pass].sync_stop(); -// ++ pass; -// #endif } - PING; CUBQL_CUDA_CALL(EventDestroy(stateDownloadedEvent)); // ================================================================== // sort {item,nodeID} list @@ -786,10 +487,6 @@ CUBQL_CUDA_CALL(Memcpy(h_tempNodes.data(),tempNodes, uint8_t *d_temp_storage = NULL; size_t temp_storage_bytes = 0; PrimState *sortedPrimStates = 0; -// #if CUBQL_PROFILE -// t_sortPrims.sync_start(); -// #endif - PING; _ALLOC(sortedPrimStates,numPrims,s,memResource); auto rc = cub::DeviceRadixSort::SortKeys((void*&)d_temp_storage, temp_storage_bytes, @@ -804,25 +501,15 @@ CUBQL_CUDA_CALL(Memcpy(h_tempNodes.data(),tempNodes, numPrims,32,64,s); rc = rc; _FREE(d_temp_storage,s,memResource); -// #if CUBQL_PROFILE -// t_sortPrims.sync_stop(); -// t_writePrims.sync_start(); -// #endif // ================================================================== // allocate and write BVH item list, and write offsets of leaf nodes // ================================================================== - PING; bvh.numPrims = numPrims; _ALLOC(bvh.primIDs,numPrims,s,memResource); writePrimsAndLeafOffsets<<>> (tempNodes,bvh.primIDs,sortedPrimStates,numPrims); -// #if CUBQL_PROFILE -// t_writePrims.sync_stop(); -// t_writeNodes.sync_start(); -// #endif - PING; // ================================================================== // allocate and write final nodes // ================================================================== @@ -830,15 +517,11 @@ CUBQL_CUDA_CALL(Memcpy(h_tempNodes.data(),tempNodes, _ALLOC(bvh.nodes,numNodes,s,memResource); writeNodes<<>> (bvh.nodes,tempNodes,numNodes); -// #if CUBQL_PROFILE -// t_writeNodes.sync_stop(); -// #endif _FREE(sortedPrimStates,s,memResource); _FREE(tempNodes,s,memResource); _FREE(nodeStates,s,memResource); _FREE(primStates,s,memResource); _FREE(buildState,s,memResource); - PING; } } // ::cuBQL::gpuBuilder_impl From fd5efa2858e221c551a4f870bc9c33bb2584e777 Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Sun, 14 Jun 2026 13:32:04 -0600 Subject: [PATCH 5/9] fixed misplaced 'volatile's --- cuBQL/builder/cuda/builder_common.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cuBQL/builder/cuda/builder_common.h b/cuBQL/builder/cuda/builder_common.h index 3e080c7..63a9a33 100644 --- a/cuBQL/builder/cuda/builder_common.h +++ b/cuBQL/builder/cuda/builder_common.h @@ -23,7 +23,7 @@ namespace cub { namespace cuBQL { namespace gpuBuilder_impl { - inline __device__ void atomic_min(volatile int32_t *v, int32_t vv) + inline __device__ void atomic_min(int32_t *v, int32_t vv) { atomicMin((int *)v,(int)vv); } inline __device__ void atomic_min(int64_t *v, int64_t vv) { atomicMin((long long *)v,(long long)vv); } @@ -32,7 +32,7 @@ namespace cuBQL { inline __device__ void atomic_min(uint64_t *v, uint64_t vv) { atomicMin((unsigned long long *)v,(unsigned long long)vv); } - inline __device__ void atomic_max(volatile int32_t *v, int32_t vv) + inline __device__ void atomic_max(int32_t *v, int32_t vv) { atomicMax((int *)v,(int)vv); } inline __device__ void atomic_max(int64_t *v, int64_t vv) { atomicMax((long long *)v,(long long)vv); } @@ -93,7 +93,7 @@ namespace cuBQL { template<> inline __device__ int32_t encode(float f) { - const int32_t sign = 0x80000000U; + const int32_t sign = 0x80000000; int32_t bits = __float_as_int(f); if (bits & sign) bits ^= 0x7fffffff; return bits; @@ -237,9 +237,9 @@ namespace cuBQL { const typename int_type_of::type enc = encode(other[d]); if (enc < abox.lower[d]) - atomic_min((volatile int32_t*)abox.lower+d,enc); + atomic_min(abox.lower+d,enc); if (enc > abox.upper[d]) - atomic_max((volatile int32_t*)abox.upper+d,enc); + atomic_max(abox.upper+d,enc); } } From ad099c4dbb87cb397ca0b7ee20a0c264569b880b Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Sun, 14 Jun 2026 13:38:27 -0600 Subject: [PATCH 6/9] more cleanups --- cuBQL/builder/cuda/sah_builder.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cuBQL/builder/cuda/sah_builder.h b/cuBQL/builder/cuda/sah_builder.h index 4ba5277..423d6fb 100644 --- a/cuBQL/builder/cuda/sah_builder.h +++ b/cuBQL/builder/cuda/sah_builder.h @@ -554,14 +554,14 @@ namespace cuBQL { GpuMemoryResource& memResource) { real3_sahBuilder(bvh,boxes,_numPrims,buildConfig,s,memResource); } - // template<> - // inline void sahBuilder(BinaryBVH &bvh, - // const box_t *boxes, - // uint32_t _numPrims, - // BuildConfig buildConfig, - // cudaStream_t s, - // GpuMemoryResource& memResource) - // { real3_sahBuilder(bvh,boxes,_numPrims,buildConfig,s,memResource); } + template<> + inline void sahBuilder(BinaryBVH &bvh, + const box_t *boxes, + uint32_t _numPrims, + BuildConfig buildConfig, + cudaStream_t s, + GpuMemoryResource& memResource) + { real3_sahBuilder(bvh,boxes,_numPrims,buildConfig,s,memResource); } } // ::cuBQL::sahBuilder_impl namespace cuda { From cff482ae4b18c737505ed1725a934b363e7549c0 Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Sun, 14 Jun 2026 13:40:11 -0600 Subject: [PATCH 7/9] Updated windows github runner config --- .github/workflows/Windows.yml | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/.github/workflows/Windows.yml b/.github/workflows/Windows.yml index 17bdf08..c03ae67 100644 --- a/.github/workflows/Windows.yml +++ b/.github/workflows/Windows.yml @@ -23,26 +23,16 @@ jobs: visual-studio: "Visual Studio 17 2022" shell: "powershell" config: Release - - os: windows-2022 - cuda: "13.0.0" - visual-studio: "Visual Studio 17 2022" - shell: "powershell" - config: Release - - os: windows-2025 - cuda: "12.6.2" - visual-studio: "Visual Studio 17 2022" - shell: "powershell" - config: Release - os: windows-2025 - cuda: "13.0.0" - visual-studio: "Visual Studio 17 2022" + cuda: "13.2.0" + visual-studio: "Visual Studio 18 2026" shell: "powershell" config: Release - os: windows-2025 - cuda: "13.0.2" - visual-studio: "Visual Studio 17 2022" + cuda: "13.2.0" + visual-studio: "Visual Studio 18 2026" shell: "powershell" - config: Release + config: Debug env: cmake_generator: "${{ matrix.visual-studio }}" From 81e2751d7fce2b7c235a28f5ad5c0bcee3d02361 Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Sun, 14 Jun 2026 14:01:20 -0600 Subject: [PATCH 8/9] fixed missing CUDART_INF_F include --- cuBQL/builder/cuda/sm_builder.h | 1 - cuBQL/math/constants.h | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/cuBQL/builder/cuda/sm_builder.h b/cuBQL/builder/cuda/sm_builder.h index 30ef82c..cf68ada 100644 --- a/cuBQL/builder/cuda/sm_builder.h +++ b/cuBQL/builder/cuda/sm_builder.h @@ -453,7 +453,6 @@ namespace cuBQL { cudaEvent_t stateDownloadedEvent; CUBQL_CUDA_CALL(EventCreate(&stateDownloadedEvent)); while (true) { - int pre_numNodes = numNodes; CUBQL_CUDA_CALL(MemcpyAsync(&numNodes,&buildState->numNodes, sizeof(numNodes),cudaMemcpyDeviceToHost,s)); CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s)); diff --git a/cuBQL/math/constants.h b/cuBQL/math/constants.h index 488ad7f..4b5c3c4 100644 --- a/cuBQL/math/constants.h +++ b/cuBQL/math/constants.h @@ -6,7 +6,7 @@ #include #include #if defined(__CUDACC__) && !defined(CUDART_INF_F) -// #include +#include #endif #ifndef M_PI From 59382763f2bd8ed9a12e148a859a8128276d14b7 Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Sun, 14 Jun 2026 14:04:42 -0600 Subject: [PATCH 9/9] added /Zc:preprocessor for windows/vs2026 - won't compile without that --- cuBQL/CMakeLists.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cuBQL/CMakeLists.txt b/cuBQL/CMakeLists.txt index 3a22714..5f2c7d8 100644 --- a/cuBQL/CMakeLists.txt +++ b/cuBQL/CMakeLists.txt @@ -6,6 +6,15 @@ # in one of his/her source files add_library(cuBQL INTERFACE) +if (WIN32) + target_compile_options(cuBQL INTERFACE + $<$:-Xcompiler="/Zc:preprocessor"> + ) +# target_compile_options(${PTX_TARGET} PRIVATE +# $<$:-std=c++17> +# ) +endif() + target_sources(cuBQL INTERFACE # main public "interface" to this library bvh.h