Skip to content
20 changes: 5 additions & 15 deletions .github/workflows/Windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,26 +23,16 @@ jobs:
visual-studio: "Visual Studio 17 2022"
shell: "powershell"
config: Release
- os: windows-2022
cuda: "13.0.0"
visual-studio: "Visual Studio 17 2022"
shell: "powershell"
config: Release
- os: windows-2025
cuda: "12.6.2"
visual-studio: "Visual Studio 17 2022"
shell: "powershell"
config: Release
- os: windows-2025
cuda: "13.0.0"
visual-studio: "Visual Studio 17 2022"
cuda: "13.2.0"
visual-studio: "Visual Studio 18 2026"
shell: "powershell"
config: Release
- os: windows-2025
cuda: "13.0.2"
visual-studio: "Visual Studio 17 2022"
cuda: "13.2.0"
visual-studio: "Visual Studio 18 2026"
shell: "powershell"
config: Release
config: Debug

env:
cmake_generator: "${{ matrix.visual-studio }}"
Expand Down
9 changes: 9 additions & 0 deletions cuBQL/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,15 @@
# in one of his/her source files
add_library(cuBQL INTERFACE)

if (WIN32)
target_compile_options(cuBQL INTERFACE
$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler="/Zc:preprocessor">
)
# target_compile_options(${PTX_TARGET} PRIVATE
# $<$<COMPILE_LANGUAGE:CUDA>:-std=c++17>
# )
endif()

target_sources(cuBQL INTERFACE
# main public "interface" to this library
bvh.h
Expand Down
15 changes: 15 additions & 0 deletions cuBQL/builder/cpu/spatialMedian.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,21 @@ namespace cuBQL {
if (box.empty()) continue;
primIDs.push_back(i);
}

if (primIDs.empty()) {
// if we had no valid input prims whatsoever
bvh.nodes = new typename BinaryBVH<T,D>::Node[1];
bvh.nodes[0].bounds = box3f();
bvh.nodes[0].admin.offset = 0;
bvh.nodes[0].admin.count = 1;
bvh.primIDs = new uint32_t[numPrims];
for (int i=0;i<numPrims;i++)
bvh.primIDs[i] = i;
bvh.numPrims = numPrims;
bvh.numNodes = 1;
return;
}

std::vector<int> altPrimIDs(primIDs.size());
std::vector<Topo> topo(1);

Expand Down
6 changes: 5 additions & 1 deletion cuBQL/builder/cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ namespace cuBQL {
DeviceMemoryResource()
{}
void malloc(void** ptr, size_t size, cudaStream_t s) override {
CUBQL_CUDA_CALL(StreamSynchronize(s));
CUBQL_CUDA_CALL(Malloc(ptr, size));
}
void free(void* ptr, cudaStream_t s) override
Expand Down Expand Up @@ -76,7 +77,10 @@ namespace cuBQL {
cudaMemPool_t mempool;
CUBQL_CUDA_CALL(DeviceGetDefaultMemPool(&mempool, iDevice));
uint64_t threshold = UINT64_MAX;
CUBQL_CUDA_CALL(MemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold));
CUBQL_CUDA_CALL
(MemPoolSetAttribute(mempool,
cudaMemPoolAttrReleaseThreshold,
&threshold));
}
} );
}
Expand Down
23 changes: 14 additions & 9 deletions cuBQL/builder/cuda/builder_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ namespace cuBQL {
if (bits & sign) bits ^= 0x7fffffff;
return bits;
}

template<> inline __device__
int64_t encode(double f)
{
Expand Down Expand Up @@ -133,9 +134,11 @@ namespace cuBQL {
if (bits & sign) bits ^= 0x7fffffff;
return __int_as_float(bits);
}

template<> inline __device__
int32_t decode<int32_t>(int32_t bits)
{ return bits; }

template<> inline __device__
int64_t decode<int64_t>(int64_t bits)
{ return bits; }
Expand Down Expand Up @@ -225,23 +228,24 @@ namespace cuBQL {
}

template<typename box_t> inline __device__
void atomic_grow(AtomicBox<box_t> &abox, const typename box_t::vec_t &other)
void atomic_grow(AtomicBox<box_t> &abox,
typename box_t::vec_t other)
{
using scalar_t = typename AtomicBox<box_t>::scalar_t;
#pragma unroll
for (int d=0;d<box_t::numDims;d++) {
const typename int_type_of<scalar_t>::type enc
= //AtomicBox<box_t>::
encode(other[d]);//get(other,d));
= encode(other[d]);
if (enc < abox.lower[d])
atomic_min(&abox.lower[d],enc);
atomic_min(abox.lower+d,enc);
if (enc > abox.upper[d])
atomic_max(&abox.upper[d],enc);
atomic_max(abox.upper+d,enc);
}
}

template<typename box_t>
inline __device__ void atomic_grow(AtomicBox<box_t> &abox, const box_t &other)
inline __device__
void atomic_grow(AtomicBox<box_t> &abox, box_t other)
{
using scalar_t = typename AtomicBox<box_t>::scalar_t;
#pragma unroll
Expand All @@ -252,13 +256,14 @@ namespace cuBQL {
const typename int_type_of<scalar_t>::type
enc_upper = //AtomicBox<box_t>::
encode(other.get_upper(d));
if (enc_lower < abox.lower[d]) atomic_min(&abox.lower[d],enc_lower);
if (enc_upper > abox.upper[d]) atomic_max(&abox.upper[d],enc_upper);
if (enc_lower < abox.lower[d]) atomic_min(abox.lower+d,enc_lower);
if (enc_upper > abox.upper[d]) atomic_max(abox.upper+d,enc_upper);
}
}

template<typename box_t>
inline __device__ void atomic_grow(AtomicBox<box_t> &abox, const AtomicBox<box_t> &other)
inline __device__ void atomic_grow(AtomicBox<box_t> &abox,
const AtomicBox<box_t> &other)
{
using scalar_t = typename AtomicBox<box_t>::scalar_t;
#pragma unroll
Expand Down
94 changes: 28 additions & 66 deletions cuBQL/builder/cuda/sm_builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,14 @@ namespace cuBQL {
};

template<typename T, int D>
struct CUBQL_ALIGN(16) TempNode {
struct CUBQL_ALIGN(16)
TempNode {
using box_t = cuBQL::box_t<T,D>;
union {
struct {
AtomicBox<box_t> centBounds;
uint32_t count;
uint32_t unused;
AtomicBox<box_t> centBounds;
} openBranch;
struct {
uint32_t offset;
Expand All @@ -42,7 +43,6 @@ namespace cuBQL {
struct {
uint32_t offset;
uint32_t count;
uint32_t unused[2];
} doneNode;
};
};
Expand Down Expand Up @@ -82,7 +82,8 @@ namespace cuBQL {
me.done = false;
// this could be made faster by block-reducing ...
atomicAdd(&nodes[0].openBranch.count,1);
atomic_grow(nodes[0].openBranch.centBounds,box.center());//centerOf(box));
auto ctr = box.center();
atomic_grow(nodes[0].openBranch.centBounds,ctr);
} else {
me.nodeID = (uint32_t)-1;
me.done = true;
Expand Down Expand Up @@ -208,13 +209,14 @@ namespace cuBQL {

auto in = nodes[nodeID].openBranch;
if (in.count <= buildConfig.makeLeafThreshold) {
auto &done = nodes[nodeID].doneNode;
auto done = nodes[nodeID].doneNode;
done.count = in.count;
// set this to max-value, so the prims can later do atomicMin
// with their position ion the leaf list; this value is
// greater than any prim position.
done.offset = (uint32_t)-1;
nodeState = DONE_NODE;
nodes[nodeID].doneNode = done;
} else {
float widestWidth = 0.f;
int widestDim = -1;
Expand All @@ -235,7 +237,8 @@ namespace cuBQL {
widestCtr = ctr;
}

auto &open = nodes[nodeID].openNode;
// auto &open = nodes[nodeID].openNode;
auto open = nodes[nodeID].openNode;
if (widestDim >= 0) {
open.pos = widestCtr;
}
Expand All @@ -249,12 +252,15 @@ namespace cuBQL {
#pragma unroll
for (int side=0;side<2;side++) {
const int childID = open.offset+side;
auto &child = nodes[childID].openBranch;
child.centBounds.set_empty();
child.count = 0;
TempNode<T,D> child;
// auto &child = nodes[childID].openBranch;
child.openBranch.centBounds.set_empty();
child.openBranch.count = 0;
nodeStates[childID] = OPEN_BRANCH;
nodes[childID] = child;
}
nodeState = OPEN_NODE;
nodes[nodeID].openNode = open;
}
#endif
}
Expand Down Expand Up @@ -420,7 +426,6 @@ namespace cuBQL {
GpuMemoryResource &memResource)
{
assert(sizeof(PrimState) == sizeof(uint64_t));

// ==================================================================
// do build on temp nodes
// ==================================================================
Expand All @@ -432,10 +437,12 @@ namespace cuBQL {
_ALLOC(nodeStates,2*numPrims,s,memResource);
_ALLOC(primStates,numPrims,s,memResource);
_ALLOC(buildState,1,s,memResource);
initState<<<1,1,0,s>>>(buildState,
nodeStates,
tempNodes);
initPrims<<<divRoundUp(numPrims,1024),1024,0,s>>>

initState<<<1,128,0,s>>>(buildState,
nodeStates,
tempNodes);

initPrims<<<divRoundUp(numPrims,128),128,0,s>>>
(tempNodes,
primStates,boxes,numPrims);

Expand All @@ -445,61 +452,30 @@ namespace cuBQL {
// ------------------------------------------------------------------
cudaEvent_t stateDownloadedEvent;
CUBQL_CUDA_CALL(EventCreate(&stateDownloadedEvent));


#if CUBQL_PROFILE
int pass = 0;
static Profile t_writeNodes;
static Profile t_writePrims;
static Profile t_sortPrims;
static Profile t_nodePass[100];
static Profile t_primPass[100];
if (t_writeNodes.name == "") {
t_writeNodes.setName("writeNodes");
t_writePrims.setName("writePrims");
t_sortPrims.setName("sortPrims");
for (int i=0;i<100;i++) {
t_nodePass[i].setName("nodePass",i);
t_primPass[i].setName("primPass",i);
}
}
#endif
while (true) {
CUBQL_CUDA_CALL(MemcpyAsync(&numNodes,&buildState->numNodes,
sizeof(numNodes),cudaMemcpyDeviceToHost,s));
if (numNodes == numDone)
break;
CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s));
CUBQL_CUDA_CALL(EventSynchronize(stateDownloadedEvent));
#if CUBQL_PROFILE
t_nodePass[pass].sync_start();
#endif
CUBQL_CUDA_SYNC_CHECK_STREAM(s);
if (numNodes == numDone)
break;

selectSplits<<<divRoundUp(numNodes,1024),1024,0,s>>>
(buildState,
nodeStates,tempNodes,numNodes,
buildConfig);
#if CUBQL_PROFILE
t_nodePass[pass].sync_stop();
t_primPass[pass].sync_start();
#endif

numDone = numNodes;

// #if 1
if (sizeof(T)*D <= sizeof(float3)) {
if (1 && sizeof(T)*D <= sizeof(float3)) {
updatePrims_shm<<<divRoundUp(numPrims,512),512,0,s>>>
(nodeStates,tempNodes,
primStates,boxes,numPrims,numDone);
} else
// #else
updatePrims<<<divRoundUp(numPrims,1024),1024,0,s>>>
updatePrims<<<divRoundUp(numPrims,128),128,0,s>>>
(nodeStates,tempNodes,
primStates,boxes,numPrims);
// #endif

#if CUBQL_PROFILE
t_primPass[pass].sync_stop();
++ pass;
#endif
}
CUBQL_CUDA_CALL(EventDestroy(stateDownloadedEvent));
// ==================================================================
Expand All @@ -510,9 +486,6 @@ namespace cuBQL {
uint8_t *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
PrimState *sortedPrimStates = 0;
#if CUBQL_PROFILE
t_sortPrims.sync_start();
#endif
_ALLOC(sortedPrimStates,numPrims,s,memResource);
auto rc =
cub::DeviceRadixSort::SortKeys((void*&)d_temp_storage, temp_storage_bytes,
Expand All @@ -527,10 +500,6 @@ namespace cuBQL {
numPrims,32,64,s);
rc = rc;
_FREE(d_temp_storage,s,memResource);
#if CUBQL_PROFILE
t_sortPrims.sync_stop();
t_writePrims.sync_start();
#endif
// ==================================================================
// allocate and write BVH item list, and write offsets of leaf nodes
// ==================================================================
Expand All @@ -539,10 +508,6 @@ namespace cuBQL {
_ALLOC(bvh.primIDs,numPrims,s,memResource);
writePrimsAndLeafOffsets<<<divRoundUp(numPrims,1024),1024,0,s>>>
(tempNodes,bvh.primIDs,sortedPrimStates,numPrims);
#if CUBQL_PROFILE
t_writePrims.sync_stop();
t_writeNodes.sync_start();
#endif

// ==================================================================
// allocate and write final nodes
Expand All @@ -551,9 +516,6 @@ namespace cuBQL {
_ALLOC(bvh.nodes,numNodes,s,memResource);
writeNodes<<<divRoundUp(numNodes,1024),1024,0,s>>>
(bvh.nodes,tempNodes,numNodes);
#if CUBQL_PROFILE
t_writeNodes.sync_stop();
#endif
_FREE(sortedPrimStates,s,memResource);
_FREE(tempNodes,s,memResource);
_FREE(nodeStates,s,memResource);
Expand Down
Loading