NVIDIA · iwald-nvidia · Jun 14, 2026 · Apr 4, 2026 · Apr 4, 2026 · Apr 4, 2026
diff --git a/.github/workflows/Windows.yml b/.github/workflows/Windows.yml
@@ -23,26 +23,16 @@ jobs:
             visual-studio: "Visual Studio 17 2022"
             shell: "powershell"
             config: Release
-          - os: windows-2022
-            cuda: "13.0.0"
-            visual-studio: "Visual Studio 17 2022"
-            shell: "powershell"
-            config: Release
-          - os: windows-2025
-            cuda: "12.6.2"
-            visual-studio: "Visual Studio 17 2022"
-            shell: "powershell"
-            config: Release
           - os: windows-2025
-            cuda: "13.0.0"
-            visual-studio: "Visual Studio 17 2022"
+            cuda: "13.2.0"
+            visual-studio: "Visual Studio 18 2026"
             shell: "powershell"
             config: Release
           - os: windows-2025
-            cuda: "13.0.2"
-            visual-studio: "Visual Studio 17 2022"
+            cuda: "13.2.0"
+            visual-studio: "Visual Studio 18 2026"
             shell: "powershell"
-            config: Release
+            config: Debug
 
     env:
       cmake_generator: "${{ matrix.visual-studio }}"

diff --git a/cuBQL/CMakeLists.txt b/cuBQL/CMakeLists.txt
@@ -6,6 +6,15 @@
 # in one of his/her source files
 add_library(cuBQL INTERFACE)
 
+if (WIN32)
+  target_compile_options(cuBQL INTERFACE
+    $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler="/Zc:preprocessor">
+  )
+#  target_compile_options(${PTX_TARGET} PRIVATE
+#    $<$<COMPILE_LANGUAGE:CUDA>:-std=c++17>
+#  )
+endif()
+
 target_sources(cuBQL INTERFACE
   # main public "interface" to this library
   bvh.h

diff --git a/cuBQL/builder/cpu/spatialMedian.h b/cuBQL/builder/cpu/spatialMedian.h
@@ -132,6 +132,21 @@ namespace cuBQL {
           if (box.empty()) continue;
           primIDs.push_back(i);
         }
+
+        if (primIDs.empty()) {
+          // if we had no valid input prims whatsoever
+          bvh.nodes = new typename BinaryBVH<T,D>::Node[1];
+          bvh.nodes[0].bounds = box3f();
+          bvh.nodes[0].admin.offset = 0;
+          bvh.nodes[0].admin.count = 1;
+          bvh.primIDs = new uint32_t[numPrims];
+          for (int i=0;i<numPrims;i++)
+            bvh.primIDs[i] = i;
+          bvh.numPrims = numPrims;
+          bvh.numNodes = 1;
+          return;
+        }
+
         std::vector<int>  altPrimIDs(primIDs.size());
         std::vector<Topo> topo(1);
 

diff --git a/cuBQL/builder/cuda.h b/cuBQL/builder/cuda.h
@@ -45,6 +45,7 @@ namespace cuBQL {
     DeviceMemoryResource()
     {}
     void malloc(void** ptr, size_t size, cudaStream_t s) override {
+      CUBQL_CUDA_CALL(StreamSynchronize(s));
       CUBQL_CUDA_CALL(Malloc(ptr, size));
     }
     void free(void* ptr, cudaStream_t s) override
@@ -76,7 +77,10 @@ namespace cuBQL {
           cudaMemPool_t mempool;
           CUBQL_CUDA_CALL(DeviceGetDefaultMemPool(&mempool, iDevice));
           uint64_t threshold = UINT64_MAX;
-          CUBQL_CUDA_CALL(MemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold));
+          CUBQL_CUDA_CALL
+            (MemPoolSetAttribute(mempool,
+                                 cudaMemPoolAttrReleaseThreshold,
+                                 &threshold));
         }
       } );
     }

diff --git a/cuBQL/builder/cuda/builder_common.h b/cuBQL/builder/cuda/builder_common.h
@@ -98,6 +98,7 @@ namespace cuBQL {
       if (bits & sign) bits ^= 0x7fffffff;
       return bits;
     }
+
     template<> inline __device__
     int64_t encode(double f)
     {
@@ -133,9 +134,11 @@ namespace cuBQL {
       if (bits & sign) bits ^= 0x7fffffff;
       return __int_as_float(bits);
     }
+
     template<> inline __device__
     int32_t decode<int32_t>(int32_t bits)
     { return bits; }
+
     template<> inline __device__
     int64_t decode<int64_t>(int64_t bits)
     { return bits; }
@@ -225,23 +228,24 @@ namespace cuBQL {
     }
 
     template<typename box_t> inline __device__
-    void atomic_grow(AtomicBox<box_t> &abox, const typename box_t::vec_t &other)
+    void atomic_grow(AtomicBox<box_t> &abox,
+                     typename box_t::vec_t other)
     {
       using scalar_t = typename AtomicBox<box_t>::scalar_t;
 #pragma unroll
       for (int d=0;d<box_t::numDims;d++) {
         const typename int_type_of<scalar_t>::type enc
-          = //AtomicBox<box_t>::
-          encode(other[d]);//get(other,d));
+          = encode(other[d]);
         if (enc < abox.lower[d])
-          atomic_min(&abox.lower[d],enc);
+          atomic_min(abox.lower+d,enc);
         if (enc > abox.upper[d])
-          atomic_max(&abox.upper[d],enc);
+          atomic_max(abox.upper+d,enc);
       }
     } 
 
     template<typename box_t>
-    inline __device__ void atomic_grow(AtomicBox<box_t> &abox, const box_t &other)
+    inline __device__
+    void atomic_grow(AtomicBox<box_t> &abox, box_t other)
     {
       using scalar_t = typename AtomicBox<box_t>::scalar_t;
 #pragma unroll
@@ -252,13 +256,14 @@ namespace cuBQL {
         const typename int_type_of<scalar_t>::type 
           enc_upper = //AtomicBox<box_t>::
           encode(other.get_upper(d));
-        if (enc_lower < abox.lower[d]) atomic_min(&abox.lower[d],enc_lower);
-        if (enc_upper > abox.upper[d]) atomic_max(&abox.upper[d],enc_upper);
+        if (enc_lower < abox.lower[d]) atomic_min(abox.lower+d,enc_lower);
+        if (enc_upper > abox.upper[d]) atomic_max(abox.upper+d,enc_upper);
       }
     }
 
     template<typename box_t>
-    inline __device__ void atomic_grow(AtomicBox<box_t> &abox, const AtomicBox<box_t> &other)
+    inline __device__ void atomic_grow(AtomicBox<box_t> &abox,
+                                       const AtomicBox<box_t> &other)
     {
       using scalar_t = typename AtomicBox<box_t>::scalar_t;
 #pragma unroll

diff --git a/cuBQL/builder/cuda/sm_builder.h b/cuBQL/builder/cuda/sm_builder.h
@@ -25,13 +25,14 @@ namespace cuBQL {
     };
 
     template<typename T, int D>
-    struct CUBQL_ALIGN(16) TempNode {
+    struct CUBQL_ALIGN(16)
+      TempNode {
       using box_t = cuBQL::box_t<T,D>;
       union {
         struct {
-          AtomicBox<box_t> centBounds;
           uint32_t         count;
           uint32_t         unused;
+          AtomicBox<box_t> centBounds;
         } openBranch;
         struct {
           uint32_t offset;
@@ -42,7 +43,6 @@ namespace cuBQL {
         struct {
           uint32_t offset;
           uint32_t count;
-          uint32_t unused[2];
         } doneNode;
       };
     };
@@ -82,7 +82,8 @@ namespace cuBQL {
         me.done   = false;
         // this could be made faster by block-reducing ...
         atomicAdd(&nodes[0].openBranch.count,1);
-        atomic_grow(nodes[0].openBranch.centBounds,box.center());//centerOf(box));
+        auto ctr = box.center();
+        atomic_grow(nodes[0].openBranch.centBounds,ctr);
       } else {
         me.nodeID = (uint32_t)-1;
         me.done   = true;
@@ -208,13 +209,14 @@ namespace cuBQL {
 
       auto in = nodes[nodeID].openBranch;
       if (in.count <= buildConfig.makeLeafThreshold) {
-        auto &done  = nodes[nodeID].doneNode;
+        auto done  = nodes[nodeID].doneNode;
         done.count  = in.count;
         // set this to max-value, so the prims can later do atomicMin
         // with their position ion the leaf list; this value is
         // greater than any prim position.
         done.offset = (uint32_t)-1;
         nodeState   = DONE_NODE;
+        nodes[nodeID].doneNode = done;
       } else {
         float widestWidth = 0.f;
         int   widestDim   = -1;
@@ -235,7 +237,8 @@ namespace cuBQL {
           widestCtr = ctr;
         }
 
-        auto &open = nodes[nodeID].openNode;
+        // auto &open = nodes[nodeID].openNode;
+        auto open = nodes[nodeID].openNode;
         if (widestDim >= 0) {
           open.pos = widestCtr;
         }
@@ -249,12 +252,15 @@ namespace cuBQL {
 #pragma unroll
         for (int side=0;side<2;side++) {
           const int childID = open.offset+side;
-          auto &child = nodes[childID].openBranch;
-          child.centBounds.set_empty();
-          child.count         = 0;
+          TempNode<T,D> child;
+          // auto &child = nodes[childID].openBranch;
+          child.openBranch.centBounds.set_empty();
+          child.openBranch.count         = 0;
           nodeStates[childID] = OPEN_BRANCH;
+          nodes[childID] = child;
         }
         nodeState = OPEN_NODE;
+        nodes[nodeID].openNode = open;
       }
 #endif
     }
@@ -420,7 +426,6 @@ namespace cuBQL {
                GpuMemoryResource &memResource)
     {
       assert(sizeof(PrimState) == sizeof(uint64_t));
-
       // ==================================================================
       // do build on temp nodes
       // ==================================================================
@@ -432,10 +437,12 @@ namespace cuBQL {
       _ALLOC(nodeStates,2*numPrims,s,memResource);
       _ALLOC(primStates,numPrims,s,memResource);
       _ALLOC(buildState,1,s,memResource);
-      initState<<<1,1,0,s>>>(buildState,
-                             nodeStates,
-                             tempNodes);
-      initPrims<<<divRoundUp(numPrims,1024),1024,0,s>>>
+
+      initState<<<1,128,0,s>>>(buildState,
+                               nodeStates,
+                               tempNodes);
+
+      initPrims<<<divRoundUp(numPrims,128),128,0,s>>>
         (tempNodes,
          primStates,boxes,numPrims);
 
@@ -445,61 +452,30 @@ namespace cuBQL {
       // ------------------------------------------------------------------      
       cudaEvent_t stateDownloadedEvent;
       CUBQL_CUDA_CALL(EventCreate(&stateDownloadedEvent));
-
-
-#if CUBQL_PROFILE
-      int pass = 0;
-      static Profile t_writeNodes;
-      static Profile t_writePrims;
-      static Profile t_sortPrims;
-      static Profile t_nodePass[100];
-      static Profile t_primPass[100];
-      if (t_writeNodes.name == "") {
-        t_writeNodes.setName("writeNodes");
-        t_writePrims.setName("writePrims");
-        t_sortPrims.setName("sortPrims");
-        for (int i=0;i<100;i++) {
-          t_nodePass[i].setName("nodePass",i);
-          t_primPass[i].setName("primPass",i);
-        }
-      }
-#endif
       while (true) {
         CUBQL_CUDA_CALL(MemcpyAsync(&numNodes,&buildState->numNodes,
                                     sizeof(numNodes),cudaMemcpyDeviceToHost,s));
-        if (numNodes == numDone)
-          break;
         CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s));
         CUBQL_CUDA_CALL(EventSynchronize(stateDownloadedEvent));
-#if CUBQL_PROFILE
-        t_nodePass[pass].sync_start();
-#endif
+        CUBQL_CUDA_SYNC_CHECK_STREAM(s);
+        if (numNodes == numDone)
+          break;
+
         selectSplits<<<divRoundUp(numNodes,1024),1024,0,s>>>
           (buildState,
            nodeStates,tempNodes,numNodes,
            buildConfig);
-#if CUBQL_PROFILE
-        t_nodePass[pass].sync_stop();
-        t_primPass[pass].sync_start();
-#endif
+
         numDone = numNodes;
 
-// #if 1
-        if (sizeof(T)*D <= sizeof(float3)) {
+        if (1 && sizeof(T)*D <= sizeof(float3)) {
           updatePrims_shm<<<divRoundUp(numPrims,512),512,0,s>>>
             (nodeStates,tempNodes,
              primStates,boxes,numPrims,numDone);
         } else 
-// #else
-        updatePrims<<<divRoundUp(numPrims,1024),1024,0,s>>>
+        updatePrims<<<divRoundUp(numPrims,128),128,0,s>>>
           (nodeStates,tempNodes,
            primStates,boxes,numPrims);
-// #endif
-
-#if CUBQL_PROFILE
-        t_primPass[pass].sync_stop();
-        ++ pass;
-#endif
       }
       CUBQL_CUDA_CALL(EventDestroy(stateDownloadedEvent));
       // ==================================================================
@@ -510,9 +486,6 @@ namespace cuBQL {
       uint8_t   *d_temp_storage     = NULL;
       size_t     temp_storage_bytes = 0;
       PrimState *sortedPrimStates   = 0;
-#if CUBQL_PROFILE
-      t_sortPrims.sync_start();
-#endif
       _ALLOC(sortedPrimStates,numPrims,s,memResource);
       auto rc =
         cub::DeviceRadixSort::SortKeys((void*&)d_temp_storage, temp_storage_bytes,
@@ -527,10 +500,6 @@ namespace cuBQL {
                                        numPrims,32,64,s);
       rc = rc;
       _FREE(d_temp_storage,s,memResource);
-#if CUBQL_PROFILE
-      t_sortPrims.sync_stop();
-      t_writePrims.sync_start();
-#endif
       // ==================================================================
       // allocate and write BVH item list, and write offsets of leaf nodes
       // ==================================================================
@@ -539,10 +508,6 @@ namespace cuBQL {
       _ALLOC(bvh.primIDs,numPrims,s,memResource);
       writePrimsAndLeafOffsets<<<divRoundUp(numPrims,1024),1024,0,s>>>
         (tempNodes,bvh.primIDs,sortedPrimStates,numPrims);
-#if CUBQL_PROFILE
-      t_writePrims.sync_stop();
-      t_writeNodes.sync_start();
-#endif
 
       // ==================================================================
       // allocate and write final nodes
@@ -551,9 +516,6 @@ namespace cuBQL {
       _ALLOC(bvh.nodes,numNodes,s,memResource);
       writeNodes<<<divRoundUp(numNodes,1024),1024,0,s>>>
         (bvh.nodes,tempNodes,numNodes);
-#if CUBQL_PROFILE
-      t_writeNodes.sync_stop();
-#endif
       _FREE(sortedPrimStates,s,memResource);
       _FREE(tempNodes,s,memResource);
       _FREE(nodeStates,s,memResource);